# CourtReasoner

Repository for the paper "CourtReasoner: Can LLM Agents Reason Like Judges?" (EMNLP 2025).
## Repository Structure

```
CourtReasoner/
├── courtreasoner-data/    # CourtReasoner benchmark
├── meta-evaluation/
│   ├── meta-eval.py       # Meta-evaluation script using large language models
│   └── run_metaeval.sh    # Shell script to run the meta-evaluation
├── requirements.txt
└── README.md
```
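This README does not document the benchmark's file format. A quick way to inspect it before running anything (a sketch that assumes nothing about the layout beyond the directory name shown above):

```bash
# List the first few benchmark files to see how the data is organized.
find courtreasoner-data -type f | head -n 20
```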
## Installation

```bash
pip3 install -r requirements.txt
```
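The evaluation script (`gpt-eval.py`) appears to call a hosted LLM API, so an API key is likely needed before running it. A minimal sketch, assuming the judge model is accessed through the OpenAI API; the environment variable below is the standard OpenAI one and is not confirmed by this repository:

```bash
# Assumption: gpt-eval.py reads the standard OPENAI_API_KEY environment variable.
export OPENAI_API_KEY="your-api-key-here"
```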
## Usage

Run the LLM-based evaluation on each exported perturbation folder, then compute its scores:

```bash
python gpt-eval.py run-eval-for-folder export/Gemini\ Deleting\ Most-Important\ Fact
python gpt-eval.py compute-scores-for-folder export/Gemini\ Deleting\ Most-Important\ Fact
python gpt-eval.py run-eval-for-folder export/Gemini\ Deleting\ Second\ Most-Important\ Fact
python gpt-eval.py compute-scores-for-folder export/Gemini\ Deleting\ Second\ Most-Important\ Fact
```
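The meta-evaluation has its own entry point under `meta-evaluation/`. A minimal sketch of invoking it, assuming `run_metaeval.sh` requires no additional arguments (this is an assumption; check the script for required paths or flags):

```bash
# Assumption: the wrapper can be run from the repository root
# and internally invokes meta-evaluation/meta-eval.py.
bash meta-evaluation/run_metaeval.sh
```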
## Citation

```bibtex
@inproceedings{han2025courtreasoner,
  title={CourtReasoner: Can LLM Agents Reason Like Judges?},
  author={Simeng Han and Yoshiki Takashima and Shannon Zejiang Shen and Chen Liu and Yixin Liu and Roque K. Thuo and Sonia Knowlton and Ruzica Piskac and Scott Shapiro and Arman Cohan},
  booktitle={The 2025 Conference on Empirical Methods in Natural Language Processing},
  year={2025}
}
```