-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract.py
More file actions
71 lines (59 loc) · 3.01 KB
/
extract.py
File metadata and controls
71 lines (59 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import json
import pandas as pd
import re
from tqdm import tqdm
# Matches anything that looks like a markup tag; compiled once instead of per instance.
_TAG_PATTERN = re.compile(r'<[^>]+>')

# Column order of the exported frame. Passing it explicitly keeps the header
# stable even when no annotation produces a row.
_EXPORT_COLUMNS = [
    "highlights",
    "highlight_indices",
    "figure_name",
    "figure_type",
    "authors",
    "source_title",
    "source_year",
    "full_text",
    "instance_id",
    "annotation_id",
    "ref",
]


def _load_json(path):
    """Return the parsed contents of the UTF-8 encoded JSON file at *path*."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


def _format_authors(creators):
    """Join creator records into a single "; "-separated author string.

    A record with both 'firstName' and 'lastName' is rendered as
    "lastName, firstName"; otherwise its 'name' value is used when present.
    Records with neither are skipped. Returns "Unknown" when nothing usable
    is found.
    """
    names = []
    for creator in creators:
        if 'firstName' in creator and 'lastName' in creator:
            names.append(f"{creator['lastName']}, {creator['firstName']}")
        elif 'name' in creator:
            names.append(creator['name'])
    return "; ".join(names) if names else "Unknown"


def create_mega_dataframe(instances_path, figures_path, sources_path):
    """Flatten annotated instances into one row per (annotation, highlight label).

    The three arguments are paths to JSON files, each holding a list of
    objects. Figures and sources are indexed by their 'id' so that every
    instance/annotation can be joined with its figure and source metadata.

    For each annotation whose figure does not set 'allow_in_gofigure' to a
    falsy value, the highlights are grouped by their 'label' (default 0) and
    one row is emitted per group, containing the joined highlight snippets,
    the JSON-encoded (start, end) pairs, figure and source metadata, and the
    tag-stripped instance text.

    Keys that are missing *or* explicitly null ('text', 'source', 'cache',
    'creators', 'annotations', 'figure', 'highlights') are treated as empty
    rather than raising.

    Returns:
        A pandas DataFrame with the columns listed in ``_EXPORT_COLUMNS``;
        the columns are present even when there are no rows.
    """
    instances = _load_json(instances_path)
    figures = _load_json(figures_path)
    sources = _load_json(sources_path)

    fig_map = {fig['id']: fig for fig in figures}
    src_map = {src['id']: src for src in sources}

    rows = []
    for inst in tqdm(instances, desc="Processing Instances"):
        text = inst.get('text') or ''
        # U+00A0 (no-break space) is spelled as an escape so the substitution
        # is visible in the source; both replacements run before tag removal.
        text = text.replace("\u00a0", " ").replace('\ufeff', '')
        clean_text = _TAG_PATTERN.sub('', text)

        # `or {}` also covers keys that are present but null in the JSON.
        source_id = (inst.get('source') or {}).get('id')
        source_data = (src_map.get(source_id) or {}).get('cache') or {}
        all_authors = _format_authors(source_data.get('creators') or [])

        for anno in inst.get('annotations') or []:
            fig_id = (anno.get('figure') or {}).get('id')
            fig_info = fig_map.get(fig_id) or {}

            # Skip excluded figures before doing any highlight work.
            if not fig_info.get('allow_in_gofigure', True):
                continue

            fig_type = fig_info.get('type')
            figure_type = fig_type.get('name') if isinstance(fig_type, dict) else "N/A"
            current_anno_id = anno.get('id')

            label_groups = {}
            index_groups = {}
            for hl in anno.get('highlights') or []:
                label_val = hl.get('label', 0)
                start, end = hl.get('start', 0), hl.get('end', 0)
                # Offsets are applied to the tag-stripped text.
                label_groups.setdefault(label_val, []).append(clean_text[start:end])
                index_groups.setdefault(label_val, []).append((start, end))

            # One row per label group within the annotation.
            for label_key, snippets in label_groups.items():
                rows.append({
                    "highlights": "; ".join(snippets),
                    "highlight_indices": json.dumps(index_groups.get(label_key, [])),
                    "figure_name": fig_info.get('name'),
                    "figure_type": figure_type,
                    "authors": all_authors,
                    "source_title": source_data.get('title'),
                    "source_year": source_data.get('date'),
                    "full_text": clean_text,
                    "instance_id": inst.get('id'),
                    "annotation_id": current_anno_id,
                    "ref": inst.get('ref'),
                })

    return pd.DataFrame(rows, columns=_EXPORT_COLUMNS)
# Build the flattened annotation table from the three JSON inputs and write
# it out as a CSV file without the DataFrame index column.
df = create_mega_dataframe(
    instances_path='./input/instances.json',
    figures_path='./input/figures.json',
    sources_path='./input/sources.json',
)
df.to_csv(
    './training/rhetorical_analysis_export_gofigure.csv',
    index=False,
)