# figdetect/extract.py at master · aadium/figdetect · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import json
import pandas as pd
import re
from tqdm import tqdm

def _load_json(path):
    """Read and parse a UTF-8 encoded JSON file."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _format_authors(creators):
    """Join creator records into a single '; '-separated author string.

    Records with both 'firstName' and 'lastName' are rendered as
    "Last, First"; records with only 'name' are used verbatim; anything else
    is skipped. Returns "Unknown" when no record could be formatted.
    """
    formatted = []
    for c in creators:
        if 'firstName' in c and 'lastName' in c:
            formatted.append(f"{c['lastName']}, {c['firstName']}")
        elif 'name' in c:
            formatted.append(c['name'])
    return "; ".join(formatted) if formatted else "Unknown"


def create_mega_dataframe(instances_path, figures_path, sources_path) -> pd.DataFrame:
    """Flatten annotated instances into one row per (annotation, label) group.

    Three JSON files are loaded: a list of instances, a list of figures and a
    list of sources. Figures and sources are indexed by their 'id'. For every
    annotation of every instance, the annotation's highlights are grouped by
    their 'label' value (0 when absent) and one row is emitted per group.
    Annotations whose figure has a falsy 'allow_in_gofigure' are skipped;
    figures without that key (or unknown figures) are kept.

    Keys that are present but hold JSON ``null`` (decoded as ``None``) are
    treated the same as missing keys instead of raising.

    Args:
        instances_path: Path to the instances JSON file.
        figures_path: Path to the figures JSON file.
        sources_path: Path to the sources JSON file.

    Returns:
        A DataFrame with the columns 'highlights', 'highlight_indices',
        'figure_name', 'figure_type', 'authors', 'source_title',
        'source_year', 'full_text', 'instance_id', 'annotation_id' and 'ref'.
        It is empty (no columns) when no row was produced.

    Raises:
        OSError: If one of the files cannot be opened.
        json.JSONDecodeError: If one of the files is not valid JSON.
        KeyError: If a figure or source record has no 'id'.
    """
    instances = _load_json(instances_path)
    figures = _load_json(figures_path)
    sources = _load_json(sources_path)

    fig_map = {f['id']: f for f in figures}
    src_map = {s['id']: s for s in sources}

    # Compiled once; it is applied to every instance text below.
    tag_pattern = re.compile(r'<[^>]+>')

    rows = []

    for inst in tqdm(instances, desc="Processing Instances"):
        # `or` (rather than a .get default) also covers keys holding null.
        text = inst.get('text') or ''
        text = text.replace("&nbsp;", " ").replace('\ufeff', '')
        clean_text = tag_pattern.sub('', text)

        source_id = (inst.get('source') or {}).get('id')
        source_data = (src_map.get(source_id) or {}).get('cache') or {}
        all_authors = _format_authors(source_data.get('creators') or [])

        for anno in inst.get('annotations') or []:
            fig_id = (anno.get('figure') or {}).get('id')
            fig_info = fig_map.get(fig_id, {})

            # Skip disallowed figures before doing any highlight work.
            if not fig_info.get('allow_in_gofigure', True):
                continue

            fig_type = fig_info.get('type')
            figure_type = fig_type.get('name') if isinstance(fig_type, dict) else "N/A"

            # label -> (snippets, spans); dict order keeps first-seen labels first.
            groups = {}
            for hl in anno.get('highlights') or []:
                start, end = hl.get('start', 0), hl.get('end', 0)
                snippets, spans = groups.setdefault(hl.get('label', 0), ([], []))
                # NOTE(review): offsets are applied to the tag-stripped text;
                # presumably they were recorded against that form - confirm.
                snippets.append(clean_text[start:end])
                spans.append((start, end))

            # Creating one row per label group within the annotation
            for snippets, spans in groups.values():
                rows.append({
                    "highlights": "; ".join(snippets),
                    "highlight_indices": json.dumps(spans),
                    "figure_name": fig_info.get('name'),
                    "figure_type": figure_type,
                    "authors": all_authors,
                    "source_title": source_data.get('title'),
                    # Column is named "year" but stores the raw 'date' value.
                    "source_year": source_data.get('date'),
                    "full_text": clean_text,
                    "instance_id": inst.get('id'),
                    "annotation_id": anno.get('id'),
                    "ref": inst.get('ref'),
                })

    return pd.DataFrame(rows)

# Build the export from the three input JSON files and write it to CSV
# without the DataFrame index column.
df = create_mega_dataframe(
    instances_path='./input/instances.json',
    figures_path='./input/figures.json',
    sources_path='./input/sources.json',
)
df.to_csv('./training/rhetorical_analysis_export_gofigure.csv', index=False)