Skip to content

Commit 76537eb

Browse files
Peter JohnsonPeter Johnson
authored andcommitted
ngram letter models added
1 parent e217985 commit 76537eb

File tree

8 files changed

+505908
-4
lines changed

8 files changed

+505908
-4
lines changed

evaluation_function/.DS_Store

0 Bytes
Binary file not shown.

evaluation_function/dev.json

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,13 @@
99
"answer": "2.0",
1010
"response": "3.0",
1111
"model": "shannon_letters_single",
12-
"uniform": false,
12+
"uniform": true,
13+
"word_count": "random"
14+
},
15+
"shannon_letters_ngram": {
16+
"answer": "2.0",
17+
"response": "3.0",
18+
"model": "shannon_letters_ngram",
1319
"word_count": "random"
1420
}
1521
}

evaluation_function/dev.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ def dev():
3737
params = Params(**config)
3838

3939
result = evaluation_function(answer, response, params)
40-
4140
print(result.to_dict())
4241

4342
if __name__ == "__main__":
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from . import basic_nn
22
from . import shannon_letters_single
3+
from . import shannon_letters_ngram
34

4-
__all__ = ["basic_nn", "shannon_letters_single"]
5+
__all__ = ["basic_nn", "shannon_letters_single", "shannon_letters_ngram"]
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
import random
2+
import csv
3+
import os
4+
from pathlib import Path
5+
from io import StringIO
6+
import re
7+
8+
from lf_toolkit.evaluation import Result, Params
9+
10+
printing=0
11+
12+
# Setup paths for saving/loading model and data
13+
BASE_DIR = Path(__file__).resolve().parent
14+
MODEL_DIR = Path(os.environ.get("MODEL_DIR", BASE_DIR / "storage"))
15+
MODEL_DIR.mkdir(parents=True, exist_ok=True)
16+
LETTERS_PATH = MODEL_DIR / "norvig_letter_ngrams.csv"
17+
WORD_LENGTHS_PATH = MODEL_DIR / "norvig_word_length_frequencies.csv"
18+
19+
# Shannon's English language generator using letter n-gram frequencies
20+
21+
# Relative Frequencies of Letters in General English Plain text From Cryptographical Mathematics, by Robert Edward Lewand
22+
# https://web.archive.org/web/20080708193159/http://pages.central.edu/emp/LintonT/classes/spring01/cryptography/letterfreq.html
23+
24+
import csv, re, random
25+
26+
def read_multingram_csv(filename: str):
    """Load a multi-section n-gram frequency CSV into nested lookup tables.

    The file is a sequence of sections.  Each section starts with a header
    row whose first cell ends in "-gram" (for example "2-gram"); the integer
    before the dash is the order ``n`` of that section.  Every following row
    is ``<ngram>,<frequency>`` until the next header.

    Args:
        filename: Path of the CSV file; passed straight to ``open()``.

    Returns:
        ``{n: {prefix: {"keys": [ngram, ...], "freqs": [float, ...]}}}``
        where ``prefix`` is the first ``n - 1`` characters of each n-gram
        (the empty string for 1-grams).  ``keys`` and ``freqs`` are parallel
        lists in file order.

    Raises:
        ValueError: If a data row appears before any "N-gram" header, or a
            frequency/order cannot be parsed as a number.
        IndexError: If a data row has no frequency column.
    """
    lookups = {}
    current_n = None

    with open(filename, newline="", encoding="utf-8") as f:
        for row in csv.reader(f):
            # Blank lines (e.g. separators between sections or a trailing
            # newline) carry no data; indexing them would crash.
            if not any(cell.strip() for cell in row):
                continue

            first = row[0].strip()
            if first.endswith("-gram"):
                current_n = int(first.split("-")[0])
                lookups[current_n] = {}
                continue

            if current_n is None:
                raise ValueError(
                    f"data row {row!r} appears before any 'N-gram' header"
                )

            key, freq = first, float(row[1])
            # Bucket each n-gram by the (n-1)-character context it extends.
            prefix = key[:current_n - 1] if current_n > 1 else ""

            bucket = lookups[current_n].setdefault(
                prefix, {"keys": [], "freqs": []}
            )
            bucket["keys"].append(key)
            bucket["freqs"].append(freq)

    return lookups
49+
50+
NGRAM_LOOKUPS = read_multingram_csv(LETTERS_PATH)
51+
52+
def sample_ngram(lookups, n, prefix="", k=1):
    """Draw ``k`` n-grams of order ``n`` that start with ``prefix``.

    Args:
        lookups: Nested table shaped like
            ``{n: {prefix: {"keys": [...], "freqs": [...]}}}``.
        n: Order of the n-gram table to sample from.
        prefix: Context bucket within that table ("" for 1-grams).
        k: Number of independent draws (with replacement).

    Returns:
        A list of ``k`` n-gram strings, chosen with probability proportional
        to their frequencies.

    Raises:
        KeyError: If there is no table for ``n`` or no bucket for ``prefix``.
    """
    try:
        data = lookups[n][prefix]
    except KeyError:
        # Same exception type as a plain lookup, but say what was missing.
        raise KeyError(f"no {n}-gram bucket for prefix {prefix!r}") from None
    return random.choices(data["keys"], weights=data["freqs"], k=k)
55+
56+
57+
def generate_word(n) -> str:
    """Generate a pseudo-word of up to ``n`` letters from NGRAM_LOOKUPS.

    The word is grown one letter at a time: a 1-gram is sampled first, then
    for each length ``i`` an i-gram is sampled from the bucket whose prefix
    is the word built so far, so the sampled i-gram becomes the new word.

    Growth stops early, and the result is marked with ``#``, when the tables
    cannot extend the word:

    * no i-gram table exists, or (for i > 2) the current word has no bucket
      in the i-gram table: returns the word so far followed by ``#``;
    * at i == 2 the sampled letter has no bucket: returns ``"#"`` alone.

    Args:
        n: Requested number of letters in the word.

    Returns:
        The generated string; the empty string when ``n`` is less than 1.
    """
    if n < 1:
        # Nothing to generate; avoids looking up a word that was never built.
        return ""

    lookups = NGRAM_LOOKUPS
    word = sample_ngram(lookups, n=1, prefix="", k=1)[0]
    if printing == 1:
        print("1-gram:", word)

    for i in range(2, n + 1):
        # Membership test rather than a length comparison, so the highest
        # available order is still used.
        if i not in lookups:
            return word + "#"
        if word not in lookups[i]:
            # Return immediately in both cases so a long requested length
            # can never index a word that was not generated.
            return word + "#" if i > 2 else "#"

        word = sample_ngram(lookups, n=i, prefix=word, k=1)[0]
        if printing == 1:
            print(f"{i}-gram:", word)

    return word
81+
82+
def csv_to_lists(filename: str) -> list:
    """Read a two-column ``key,value`` CSV with a header row.

    Args:
        filename: Path of the CSV file; passed straight to ``open()``.

    Returns:
        A list of ``[key, value]`` pairs in file order, with ``key`` kept as
        the string read from the file and ``value`` converted to ``float``.
        An empty file yields an empty list.

    Raises:
        ValueError: If a non-blank row does not have exactly two columns or
            its value is not a number.
    """
    frequencies = []
    with open(filename, newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header row; tolerate a completely empty file
        for row in reader:
            if not row:
                continue  # blank line, e.g. a trailing newline
            key, value = row
            frequencies.append([key, float(value)])
    return frequencies
90+
91+
def run(response, answer, params:Params) -> Result:
    """Generate a space-separated string of pseudo-words.

    Word lengths are drawn from the frequency table stored at
    WORD_LENGTHS_PATH and each word is produced by ``generate_word``.

    Args:
        response: Not used by this model.
        answer: Not used by this model.
        params: Read for ``"word_count"``: the number of words to generate
            (default 10), or the string ``"random"`` to pick a count
            between 3 and 15 inclusive.  Numeric strings are accepted.

    Returns:
        A Result with ``is_correct`` always True and the generated text as
        the ``"general"`` feedback item.

    Raises:
        ValueError: If ``word_count`` is a string that is neither
            ``"random"`` nor an integer.
    """
    data = csv_to_lists(WORD_LENGTHS_PATH)
    lengths = [row[0] for row in data]
    weights = [row[1] for row in data]

    word_count = params.get("word_count", 10)
    if word_count == "random":
        word_count = random.randint(3, 15)
    else:
        # Config values may arrive as strings; range() needs an int.
        word_count = int(word_count)

    words = []
    for _ in range(word_count):
        # Lengths are read from the CSV as strings, hence the int().
        k = int(random.choices(lengths, weights=weights, k=1)[0])
        words.append(generate_word(k))

    output = " ".join(words)
    return Result(is_correct=True, feedback_items=[("general", output)])

evaluation_function/models/shannon_letters_single.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
BASE_DIR = Path(__file__).resolve().parent
1212
MODEL_DIR = Path(os.environ.get("MODEL_DIR", BASE_DIR / "storage"))
1313
MODEL_DIR.mkdir(parents=True, exist_ok=True)
14-
LETTERS_PATH = MODEL_DIR / "norvig_letter_frequencies.csv"
14+
LETTERS_PATH = MODEL_DIR / "norvig_letter_single.csv"
1515
WORD_LENGTHS_PATH = MODEL_DIR / "norvig_word_length_frequencies.csv"
1616

1717
# Relative Frequencies of Letters in General English Plain text From Cryptographical Mathematics, by Robert Edward Lewand

0 commit comments

Comments
 (0)