-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathngrams.py
More file actions
37 lines (34 loc) · 1.39 KB
/
ngrams.py
File metadata and controls
37 lines (34 loc) · 1.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import nltk
import re
import math
import numpy
import copy
import json
from nltk.tokenize.texttiling import TextTilingTokenizer
from textblob import TextBlob
def _create_ngrams_table(tokseqs, n=2):
    """Build a table of per-n-gram occurrence statistics over token sequences.

    For every token sequence, the first element of each entry in its
    ``wrdindex_list`` is taken as a token, the tokens are grouped into
    n-grams, and each distinct n-gram is tracked in a
    ``nltk.tokenize.texttiling.TokenTableField`` record.

    Args:
        tokseqs: Iterable of objects exposing a ``wrdindex_list`` attribute
            whose entries are indexable, with the token at position 0
            (presumably TextTiling ``TokenSequence`` objects -- confirm
            against the caller).
        n: Size of the n-grams to collect. Defaults to 2 (bigrams). Values
            2 and 3 produce the same n-grams as ``nltk.bigrams`` and
            ``nltk.trigrams``; other sizes are passed to ``nltk.ngrams``
            instead of silently falling back to bigrams.

    Returns:
        dict mapping each n-gram tuple to its ``TokenTableField``. On each
        record, ``total_count`` is the number of occurrences seen,
        ``ts_occurences`` is a list of ``[sequence_index, count]`` pairs in
        order of appearance, and ``last_tok_seq`` is the index of the most
        recent sequence containing the n-gram. ``first_pos``, ``par_count``
        and ``last_par`` are only set to fixed initial values (0, 1, 0) and
        are never updated here.
    """
    make_field = nltk.tokenize.texttiling.TokenTableField
    ngram_table = {}

    for seq_index, ts in enumerate(tokseqs):
        # Extract the tokens once per sequence; only the first element of
        # each wrdindex_list entry is used.
        words = [entry[0] for entry in ts.wrdindex_list]

        for ngram in nltk.ngrams(words, n):
            field = ngram_table.get(ngram)

            if field is None:
                # First sighting of this n-gram anywhere.
                ngram_table[ngram] = make_field(
                    first_pos=0,
                    ts_occurences=[[seq_index, 1]],
                    total_count=1,
                    par_count=1,
                    last_par=0,
                    last_tok_seq=seq_index,
                )
                continue

            field.total_count += 1
            if field.last_tok_seq != seq_index:
                # First sighting within the current sequence: open a new
                # [sequence_index, count] entry.
                field.last_tok_seq = seq_index
                field.ts_occurences.append([seq_index, 1])
            else:
                # Repeat within the current sequence: bump its entry,
                # which is always the most recently appended one.
                field.ts_occurences[-1][1] += 1

    return ngram_table