jinjiajie committed on
Commit
05195c1
·
verified ·
1 Parent(s): 4d69f83

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +64 -3
  2. seed.json +258 -0
app.py CHANGED
@@ -21,6 +21,7 @@ RESULTS_REPO = "RUC-NLPIR/GISA-leaderboard"
21
  META_FILE = "encrypted_question.jsonl"
22
  ANSWER_DIR = "answer"
23
  CACHE_DIR = "cache/answers"
 
24
 
25
  ASSETS_DIR = os.path.join(os.path.dirname(__file__), "assets")
26
  INDEX_HTML = os.path.join(ASSETS_DIR, "index.html")
@@ -105,10 +106,17 @@ def load_results_dataset():
105
 
106
  def build_leaderboard_rows() -> List[dict]:
107
  ds = load_results_dataset()
108
- if ds is None:
109
- return []
 
 
 
 
 
 
 
110
  rows: List[dict] = []
111
- for row in ds:
112
  rows.append(
113
  {
114
  "model": row.get("model", "-"),
@@ -130,6 +138,17 @@ def build_leaderboard_rows() -> List[dict]:
130
  return rows
131
 
132
 
 
 
 
 
 
 
 
 
 
 
 
133
  def render_page() -> str:
134
  html = _load_text(INDEX_HTML)
135
  html = html.replace("__LEADERBOARD_DATA__", "")
@@ -155,6 +174,47 @@ def ensure_results_repo():
155
  )
156
 
157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  def _get_metric(summary: dict, qtype: str, key: str, fallback: float = 0.0) -> float:
159
  return float(summary.get(qtype, {}).get(key, fallback) or 0.0)
160
 
@@ -299,6 +359,7 @@ def add_new_eval(
299
  return format_log("Submission received! Please refresh the leaderboard to see your score.")
300
 
301
 
 
302
  leaderboard_data = build_leaderboard_rows()
303
  css = _load_text(STYLES_CSS)
304
  html = render_page()
 
21
  META_FILE = "encrypted_question.jsonl"
22
  ANSWER_DIR = "answer"
23
  CACHE_DIR = "cache/answers"
24
+ SEED_FILE = os.path.join(os.path.dirname(__file__), "seed.json")
25
 
26
  ASSETS_DIR = os.path.join(os.path.dirname(__file__), "assets")
27
  INDEX_HTML = os.path.join(ASSETS_DIR, "index.html")
 
106
 
def build_leaderboard_rows() -> List[dict]:
    """Build the leaderboard rows, falling back to the seed entries.

    Rows are produced from the dataset returned by
    ``load_results_dataset()`` when it is present and non-empty. Otherwise
    the rows come from ``load_seed_rows()``, and an empty list is returned
    when there are no seed rows either.
    """
    results = load_results_dataset()
    if results is not None and len(results) != 0:
        return _rows_from_source(results)

    # No usable results dataset: use the bundled seed entries instead.
    fallback_rows = load_seed_rows()
    return _rows_from_source(fallback_rows) if fallback_rows else []
115
+
116
+
117
+ def _rows_from_source(source) -> List[dict]:
118
  rows: List[dict] = []
119
+ for row in source:
120
  rows.append(
121
  {
122
  "model": row.get("model", "-"),
 
138
  return rows
139
 
140
 
def load_seed_rows() -> List[dict]:
    """Load the seed leaderboard entries stored in ``SEED_FILE``.

    Returns:
        The dict entries of the JSON array in ``SEED_FILE``. An empty list
        is returned when the file is missing or unreadable, is not valid
        JSON, or does not hold a JSON array.
    """
    try:
        with open(SEED_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers both
        # malformed JSON (JSONDecodeError) and undecodable bytes.
        return []
    if not isinstance(data, list):
        return []
    # Callers use ``row.get(...)`` on every entry, so drop anything that is
    # not a JSON object rather than letting it fail later.
    return [row for row in data if isinstance(row, dict)]
150
+
151
+
152
  def render_page() -> str:
153
  html = _load_text(INDEX_HTML)
154
  html = html.replace("__LEADERBOARD_DATA__", "")
 
174
  )
175
 
176
 
def seed_results_if_needed():
    """Publish the seed rows to ``RESULTS_REPO`` when no results exist yet.

    Nothing happens when ``TOKEN`` is unset, when there are no seed rows, or
    when ``load_results_dataset()`` already returns a non-empty dataset.
    The upload is best-effort: a failure is logged and otherwise ignored.
    """
    import logging

    # Cheap local checks first, so the results dataset is only loaded when a
    # push could actually happen.
    if not TOKEN:
        return
    seed_rows = load_seed_rows()
    if not seed_rows:
        return
    ds = load_results_dataset()
    if ds is not None and len(ds) > 0:
        return
    # NOTE(review): ``ds is None`` is treated as "no results yet". If
    # load_results_dataset() also returns None on a transient load failure,
    # the push below could replace existing results — confirm.

    entries = []
    for row in seed_rows:
        if not isinstance(row, dict):
            # Skip malformed seed entries instead of failing on ``.get``.
            continue
        entries.append(
            {
                "model": row.get("model", "-"),
                "org": row.get("org", "-"),
                "framework": row.get("framework", "N/A"),
                "date": row.get("date", "-"),
                # The seed file stores the headline score under "overall";
                # it is published under "overall_em".
                "overall_em": _safe_float(row.get("overall")),
                "item_em": _safe_float(row.get("item_em")),
                "set_em": _safe_float(row.get("set_em")),
                "set_f1": _safe_float(row.get("set_f1")),
                "list_em": _safe_float(row.get("list_em")),
                "list_f1": _safe_float(row.get("list_f1")),
                "list_order": _safe_float(row.get("list_order")),
                "table_em": _safe_float(row.get("table_em")),
                "table_row_f1": _safe_float(row.get("table_row_f1")),
                "table_item_f1": _safe_float(row.get("table_item_f1")),
                "url": row.get("url", ""),
                "email": row.get("email", ""),
                "username": row.get("username", "seed"),
            }
        )
    if not entries:
        return

    try:
        ensure_results_repo()
        Dataset.from_list(entries).push_to_hub(RESULTS_REPO, token=TOKEN)
    except Exception:
        # Seeding must never break startup, but a silent failure is hard to
        # diagnose, so record the traceback.
        logging.getLogger(__name__).exception(
            "Failed to push seed results to %s", RESULTS_REPO
        )
216
+
217
+
218
  def _get_metric(summary: dict, qtype: str, key: str, fallback: float = 0.0) -> float:
219
  return float(summary.get(qtype, {}).get(key, fallback) or 0.0)
220
 
 
359
  return format_log("Submission received! Please refresh the leaderboard to see your score.")
360
 
361
 
362
+ seed_results_if_needed()
363
  leaderboard_data = build_leaderboard_rows()
364
  css = _load_text(STYLES_CSS)
365
  html = render_page()
seed.json ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "model": "Qwen3-235B-A22B (thinking)",
4
+ "org": "Alibaba Cloud",
5
+ "framework": "ReAct",
6
+ "date": "2025.7",
7
+ "overall": 9.65,
8
+ "item_em": 40.91,
9
+ "set_em": 18.0,
10
+ "set_f1": 52.37,
11
+ "list_em": 14.58,
12
+ "list_f1": 36.48,
13
+ "list_order": 35.96,
14
+ "table_em": 4.35,
15
+ "table_row_f1": 28.32,
16
+ "table_item_f1": 43.93
17
+ },
18
+ {
19
+ "model": "Claude 4.5 Sonnet (non-thinking)",
20
+ "org": "Anthropic AI",
21
+ "framework": "ReAct",
22
+ "date": "2025.9",
23
+ "overall": 16.36,
24
+ "item_em": 59.09,
25
+ "set_em": 26.0,
26
+ "set_f1": 60.87,
27
+ "list_em": 22.92,
28
+ "list_f1": 58.76,
29
+ "list_order": 57.78,
30
+ "table_em": 9.49,
31
+ "table_row_f1": 47.85,
32
+ "table_item_f1": 63.71
33
+ },
34
+ {
35
+ "model": "Claude 4.5 Sonnet (thinking)",
36
+ "org": "Anthropic AI",
37
+ "framework": "ReAct",
38
+ "date": "2025.9",
39
+ "overall": 19.3,
40
+ "item_em": 63.64,
41
+ "set_em": 28.0,
42
+ "set_f1": 64.86,
43
+ "list_em": 22.92,
44
+ "list_f1": 59.24,
45
+ "list_order": 56.42,
46
+ "table_em": 13.04,
47
+ "table_row_f1": 49.92,
48
+ "table_item_f1": 65.17
49
+ },
50
+ {
51
+ "model": "Gemini 3 Pro (low)",
52
+ "org": "Google",
53
+ "framework": "ReAct",
54
+ "date": "2025.11",
55
+ "overall": 14.74,
56
+ "item_em": 45.45,
57
+ "set_em": 28.0,
58
+ "set_f1": 63.82,
59
+ "list_em": 27.08,
60
+ "list_f1": 57.55,
61
+ "list_order": 56.37,
62
+ "table_em": 7.11,
63
+ "table_row_f1": 45.93,
64
+ "table_item_f1": 64.93
65
+ },
66
+ {
67
+ "model": "Gemini 3 Pro (high)",
68
+ "org": "Google",
69
+ "framework": "ReAct",
70
+ "date": "2025.11",
71
+ "overall": 15.28,
72
+ "item_em": 50.0,
73
+ "set_em": 22.0,
74
+ "set_f1": 62.66,
75
+ "list_em": 27.08,
76
+ "list_f1": 60.87,
77
+ "list_order": 60.12,
78
+ "table_em": 8.7,
79
+ "table_row_f1": 47.01,
80
+ "table_item_f1": 66.02
81
+ },
82
+ {
83
+ "model": "GPT-5.2 (thinking)",
84
+ "org": "OpenAI",
85
+ "framework": "ReAct",
86
+ "date": "2025.11",
87
+ "overall": 15.82,
88
+ "item_em": 63.64,
89
+ "set_em": 26.0,
90
+ "set_f1": 62.7,
91
+ "list_em": 16.67,
92
+ "list_f1": 54.11,
93
+ "list_order": 53.17,
94
+ "table_em": 9.49,
95
+ "table_row_f1": 43.04,
96
+ "table_item_f1": 60.2
97
+ },
98
+ {
99
+ "model": "DeepSeek-V3.2 (non-thinking)",
100
+ "org": "DeepSeek AI",
101
+ "framework": "ReAct",
102
+ "date": "2025.12",
103
+ "overall": 11.53,
104
+ "item_em": 22.73,
105
+ "set_em": 20.0,
106
+ "set_f1": 52.0,
107
+ "list_em": 22.92,
108
+ "list_f1": 56.02,
109
+ "list_order": 55.45,
110
+ "table_em": 6.72,
111
+ "table_row_f1": 44.14,
112
+ "table_item_f1": 62.24
113
+ },
114
+ {
115
+ "model": "DeepSeek-V3.2 (thinking)",
116
+ "org": "DeepSeek AI",
117
+ "framework": "ReAct",
118
+ "date": "2025.12",
119
+ "overall": 14.47,
120
+ "item_em": 63.64,
121
+ "set_em": 28.0,
122
+ "set_f1": 60.79,
123
+ "list_em": 20.83,
124
+ "list_f1": 62.25,
125
+ "list_order": 60.41,
126
+ "table_em": 6.32,
127
+ "table_row_f1": 43.44,
128
+ "table_item_f1": 62.42
129
+ },
130
+ {
131
+ "model": "GLM-4.7 (thinking)",
132
+ "org": "Z.AI",
133
+ "framework": "ReAct",
134
+ "date": "2025.12",
135
+ "overall": 14.21,
136
+ "item_em": 50.0,
137
+ "set_em": 22.0,
138
+ "set_f1": 59.44,
139
+ "list_em": 20.83,
140
+ "list_f1": 51.99,
141
+ "list_order": 50.97,
142
+ "table_em": 8.3,
143
+ "table_row_f1": 43.97,
144
+ "table_item_f1": 61.28
145
+ },
146
+ {
147
+ "model": "Seed-1.8 (thinking)",
148
+ "org": "ByteDance Seed",
149
+ "framework": "ReAct",
150
+ "date": "2025.12",
151
+ "overall": 13.4,
152
+ "item_em": 45.45,
153
+ "set_em": 32.0,
154
+ "set_f1": 56.77,
155
+ "list_em": 16.67,
156
+ "list_f1": 56.11,
157
+ "list_order": 53.54,
158
+ "table_em": 6.32,
159
+ "table_row_f1": 38.49,
160
+ "table_item_f1": 57.13
161
+ },
162
+ {
163
+ "model": "Qwen3-Max (thinking)",
164
+ "org": "Alibaba Cloud",
165
+ "framework": "ReAct",
166
+ "date": "2026.1",
167
+ "overall": 17.96,
168
+ "item_em": 59.09,
169
+ "set_em": 30.0,
170
+ "set_f1": 63.45,
171
+ "list_em": 25.0,
172
+ "list_f1": 66.51,
173
+ "list_order": 64.08,
174
+ "table_em": 10.67,
175
+ "table_row_f1": 48.48,
176
+ "table_item_f1": 66.86
177
+ },
178
+ {
179
+ "model": "Kimi K2.5 (thinking)",
180
+ "org": "Moonshot AI",
181
+ "framework": "ReAct",
182
+ "date": "2026.1",
183
+ "overall": 15.55,
184
+ "item_em": 68.18,
185
+ "set_em": 28.0,
186
+ "set_f1": 61.71,
187
+ "list_em": 18.75,
188
+ "list_f1": 50.52,
189
+ "list_order": 48.81,
190
+ "table_em": 7.91,
191
+ "table_row_f1": 45.19,
192
+ "table_item_f1": 61.23
193
+ },
194
+ {
195
+ "model": "GPT-4o Search Preview",
196
+ "org": "OpenAI",
197
+ "framework": "N/A",
198
+ "date": "2025.3",
199
+ "overall": 5.63,
200
+ "item_em": 13.64,
201
+ "set_em": 4.0,
202
+ "set_f1": 38.7,
203
+ "list_em": 8.33,
204
+ "list_f1": 36.65,
205
+ "list_order": 36.0,
206
+ "table_em": 4.74,
207
+ "table_row_f1": 29.59,
208
+ "table_item_f1": 45.61
209
+ },
210
+ {
211
+ "model": "OpenAI o4 Mini Deep Research",
212
+ "org": "OpenAI",
213
+ "framework": "N/A",
214
+ "date": "2025.6",
215
+ "overall": 7.78,
216
+ "item_em": 18.18,
217
+ "set_em": 14.0,
218
+ "set_f1": 63.03,
219
+ "list_em": 18.75,
220
+ "list_f1": 53.72,
221
+ "list_order": 52.59,
222
+ "table_em": 3.56,
223
+ "table_row_f1": 36.78,
224
+ "table_item_f1": 56.47
225
+ },
226
+ {
227
+ "model": "Perplexity Sonar Pro Search",
228
+ "org": "Perplexity AI",
229
+ "framework": "N/A",
230
+ "date": "-",
231
+ "overall": 7.51,
232
+ "item_em": 22.73,
233
+ "set_em": 20.0,
234
+ "set_f1": 47.04,
235
+ "list_em": 6.25,
236
+ "list_f1": 34.74,
237
+ "list_order": 33.16,
238
+ "table_em": 3.95,
239
+ "table_row_f1": 34.76,
240
+ "table_item_f1": 49.05
241
+ },
242
+ {
243
+ "model": "Google Search AI Mode",
244
+ "org": "Google",
245
+ "framework": "N/A",
246
+ "date": "-",
247
+ "overall": 9.38,
248
+ "item_em": 31.82,
249
+ "set_em": 20.0,
250
+ "set_f1": 46.34,
251
+ "list_em": 8.33,
252
+ "list_f1": 40.64,
253
+ "list_order": 39.36,
254
+ "table_em": 5.53,
255
+ "table_row_f1": 31.15,
256
+ "table_item_f1": 50.79
257
+ }
258
+ ]