-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathphonetize.py
More file actions
51 lines (41 loc) · 1.73 KB
/
phonetize.py
File metadata and controls
51 lines (41 loc) · 1.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import pandas as pd
from phonemizer import phonemize
from phonemizer.separator import Separator
def _phonemize_lines(lines, separator, language, backend, njobs):
    """Phonemize *lines* in a single bulk call, one result per input line.

    Args:
        lines: List of strings to transcribe.
        separator: ``Separator`` instance forwarded to ``phonemize``.
        language: Language code forwarded to ``phonemize``.
        backend: Backend name forwarded to ``phonemize``.
        njobs: Number of parallel jobs forwarded to ``phonemize``.

    Returns:
        A list with exactly ``len(lines)`` entries.

    Raises:
        ValueError: If the phonemizer returns a different number of results
            than it was given, since callers align results to inputs by
            position.
    """
    if not lines:
        # Nothing to transcribe; avoid invoking the backend on an empty batch.
        return []

    phonemized = phonemize(
        lines,
        language=language, backend=backend, separator=separator,
        strip=True, njobs=njobs,
        # Requested so that empty inputs are expected to keep their slot in
        # the output; the length check below verifies that this held.
        preserve_empty_lines=True,
    )

    if len(phonemized) != len(lines):
        raise ValueError(
            f"phonemize returned {len(phonemized)} results "
            f"for {len(lines)} inputs; cannot align output with input rows"
        )
    return phonemized


def generate_phonetic_csv(input_file, output_file, *,
                          language='en-us', backend='espeak', njobs=5):
    """Add phonetic transcriptions of two text columns to a CSV file.

    Reads ``input_file`` with pandas, transcribes the ``full_text`` column
    into ``full_text_phonetic`` and the ``highlights`` column into
    ``highlights_phonetic``, then writes the result to ``output_file``
    without the index. Each ``highlights`` cell is split on ``;``, every
    fragment is transcribed separately, and the fragments of a row are
    rejoined with ``"; "``.

    Args:
        input_file: CSV to read; must contain ``full_text`` and
            ``highlights`` columns.
        output_file: Destination passed to ``DataFrame.to_csv``.
        language: Language code passed to the phonemizer.
        backend: Phonemizer backend name.
        njobs: Number of parallel jobs passed to the phonemizer.

    Raises:
        KeyError: If either required column is missing from the input.
        ValueError: If the phonemizer output cannot be aligned with its input.
    """
    df = pd.read_csv(input_file)

    # Missing cells become empty strings; astype(str) additionally guards
    # against columns pandas parsed as numeric, which would otherwise break
    # the str.split call below.
    for column in ('full_text', 'highlights'):
        df[column] = df[column].fillna("").astype(str)

    # Phones are separated by a space and words by " <W> " in the output.
    separator = Separator(phone=" ", word=" <W> ")

    # 1. Phonemize the full text, one utterance per row.
    print("Transcribing full_text...")
    df['full_text_phonetic'] = _phonemize_lines(
        df['full_text'].tolist(), separator, language, backend, njobs
    )

    # 2. Flatten all highlight fragments into one list so the phonemizer is
    #    called once, remembering which row each fragment came from.
    print("Preparing highlights for bulk transcription...")
    all_parts = []
    row_map = []
    for row_idx, entry in enumerate(df['highlights'].tolist()):
        parts = entry.split(';')
        all_parts.extend(parts)
        row_map.extend([row_idx] * len(parts))

    print(f"Transcribing {len(all_parts)} highlight fragments in bulk...")
    phonemized_all = _phonemize_lines(
        all_parts, separator, language, backend, njobs
    )

    # 3. Regroup the transcribed fragments by their original row. The helper
    #    guarantees len(phonemized_all) == len(row_map), so zip cannot
    #    silently drop fragments here.
    print("Reconstructing rows...")
    reconstructed = [[] for _ in range(len(df))]
    for ph_part, row_idx in zip(phonemized_all, row_map):
        reconstructed[row_idx].append(ph_part)
    df['highlights_phonetic'] = ["; ".join(r) for r in reconstructed]

    df.to_csv(output_file, index=False)
    print(f"Success! Saved to {output_file}")
if __name__ == "__main__":
    # Script entry point: transcribe the training CSV and write the
    # phonetized copy alongside it.
    generate_phonetic_csv(
        input_file='./training/rhetorical_analysis_english_gofigure.csv',
        output_file='./training/gofigure_phonetized.csv',
    )