generated from SalesforceAIResearch/oss-template
-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathpreprocess.py
More file actions
executable file
·107 lines (85 loc) · 2.65 KB
/
preprocess.py
File metadata and controls
executable file
·107 lines (85 loc) · 2.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python3
"""
Preprocess research reports from MD files to JSON format.
Usage:
# Preprocess every model directory found under a base path
python preprocess.py /path/to/model_outputs --output-dir extracted_reports/
# Restrict to specific models and enable verbose logging
python preprocess.py /path/to/model_outputs --models model_a model_b --verbose
"""
"""
Preprocess research reports from directory structure to JSON.
Expected directory structure:
base_path/
model_name_1/
qid_<qid>_report.md
model_name_2/
qid_<qid>_report.md
...
Example usage:
# Process all models in directory
python preprocess.py /path/to/model_outputs
# Process specific models only
python preprocess.py /path/to/model_outputs -m gpt-5-search gemini-pro
# With custom output directory
python preprocess.py /path/to/model_outputs -o my_reports
"""
import argparse
import sys
from liveresearchbench.common.io_utils import preprocess_reports
def main(argv=None) -> int:
    """Run the report-preprocessing command-line interface.

    Parses the command-line options, forwards them as keyword arguments to
    ``preprocess_reports`` and turns the outcome into a process exit status.

    Args:
        argv: Optional list of argument strings to parse instead of the real
            command line. ``None`` (the default) lets argparse read
            ``sys.argv[1:]``, which is what the script entry point relies on.

    Returns:
        0 if ``preprocess_reports`` returns without raising, 1 if it raises
        any ``Exception``.

    Raises:
        SystemExit: Raised by argparse itself for ``--help`` or invalid
            arguments, before any preprocessing is attempted.
    """
    parser = argparse.ArgumentParser(
        description="Preprocess research reports from directory structure to JSON",
        epilog="""
Expected directory structure:
base_path/
model_name_1/
qid_<qid>_report.md
model_name_2/
qid_<qid>_report.md
...
""",
        # Keep the epilog's line breaks exactly as written in the help output.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "base_path",
        help="Base directory containing model subdirectories (each with qid_*_report.md files)",
    )
    parser.add_argument(
        "-m", "--models",
        nargs="+",
        help="Specific model names to process (default: process all subdirectories)",
    )
    parser.add_argument(
        "-o", "--output-dir",
        default="extracted_reports",
        help="Output directory for JSON file (default: extracted_reports)",
    )
    parser.add_argument(
        "--use-realtime",
        action="store_true",
        help="Replace temporal placeholders ({{current_year}}, etc.) with current values",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable verbose logging",
    )
    args = parser.parse_args(argv)

    try:
        # The return value is not needed here; only success/failure matters.
        preprocess_reports(
            base_path=args.base_path,
            model_names=args.models,
            output_dir=args.output_dir,
            verbose=args.verbose,
            use_realtime=args.use_realtime,
        )
    except Exception as e:
        # Top-level CLI boundary: report the failure on stderr (the same
        # stream traceback.print_exc() uses) and signal it via the exit code.
        print(f"\n❌ Error during preprocessing: {e}", file=sys.stderr)
        if args.verbose:
            import traceback

            traceback.print_exc()
        return 1
    return 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    raise SystemExit(main())