23 changes: 10 additions & 13 deletions README.md
@@ -19,10 +19,10 @@ import text2vec

* input: List of Documents; `doc_list` is a list of documents/paragraphs/sentences.
```
t2v = text2vec.text2vec(doc_list)
t2v = text2vec.Text2Vec(doc_list)
```

* output: List of Vectors of dimention N
* output: List of Vectors of dimension N
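
To make the input/output contract concrete, here is a minimal usage sketch (assuming the renamed `Text2Vec` class from this PR and a spaCy model with word vectors installed; the documents are illustrative):

```
# Minimal usage sketch; doc_list contents are illustrative.
import text2vec

doc_list = [
    "The cat sat on the mat.",
    "Dogs are loyal companions.",
    "Word embeddings capture semantic similarity.",
]

t2v = text2vec.Text2Vec(doc_list)

# Each transformation returns a matrix with one N-dimensional row per document.
docs_emb = t2v.tfidf_weighted_wv()
print(docs_emb.shape)  # (number of documents, N)
```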

We perform this transformation in the following ways.

@@ -52,32 +52,32 @@ docs_emb = t2v.tfidf_weighted_wv()
For a more detailed introduction to using Weighted Word Embeddings w.r.t. TF-IDF, please read [here](https://github.com/crownpku/text2vec/blob/master/wv_wrt_tfidf.md).
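
The weighting step itself is just a matrix product between the TF-IDF matrix and the term-embedding matrix; here is a minimal NumPy sketch of the idea (all shapes are illustrative assumptions):

```
import numpy as np

# Illustrative shapes: D documents, V vocabulary terms, N embedding dimensions.
D, V, N = 3, 100, 300
docs_vecs = np.random.rand(D, V)       # TF-IDF weight of each term in each document
tfidf_emb_vecs = np.random.rand(V, N)  # one word embedding per vocabulary term

# Each document vector is the TF-IDF weighted sum of its term embeddings.
docs_emb = docs_vecs @ tfidf_emb_vecs  # shape (D, N)
```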


## Usage of Similarity Calculation (simical)
## Usage of Similarity Calculation (SimiCal)

For example, suppose we want to calculate the similarity/distance between the first two sentences in the `docs_emb` we just computed.

Note that cosine similarity ranges from -1 to 1 (1 is most similar; for non-negative vectors it stays within 0-1).
For the other similarity measurements the results are actually **distance** (the larget the less similar). It's better to calculate distance for all possible pairs and then rank.
For the other similarity measurements the results are actually **distance** (the larger the value, the less similar). It's better to calculate distances for all possible pairs and then rank them; see the ranking sketch after the code block below.

```
# Initialize
import text2vec
sc = text2vec.simical(docs_emb[0], docs_emb[1])
sc = text2vec.SimiCal(docs_emb[0], docs_emb[1])

# Use Cosine
simi_cos = sc.Cosine()
simi_cos = sc.cosine()

# Use Euclidean
simi_euc = sc.Euclidean()
simi_euc = sc.euclidean()

# Use Triangle's Area Similarity (TS)
simi_ts = sc.Triangle()
simi_ts = sc.triangle()

# Use Sector's Area Similarity (SS)
simi_ss = sc.Sector()
simi_ss = sc.sector()

# Use TS-SS
simi_ts_ss = sc.TS_SS()
simi_ts_ss = sc.ts_ss()
```
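
Since the non-cosine measures are distances, a practical pattern is to score every pair and sort ascending. A hypothetical sketch (not part of this PR), reusing the `docs_emb` computed above:

```
from itertools import combinations

import text2vec

# Score all document pairs; a smaller TS-SS distance means more similar.
pairs = []
for i, j in combinations(range(len(docs_emb)), 2):
    sc = text2vec.SimiCal(docs_emb[i], docs_emb[j])
    pairs.append(((i, j), sc.ts_ss()))

pairs.sort(key=lambda p: p[1])
print(pairs[:3])  # the three most similar pairs
```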

## Reference
@@ -87,6 +87,3 @@ https://radimrehurek.com/gensim/tut2.html
https://github.com/sdimi/average-word2vec

https://github.com/taki0112/Vector_Similarity



151 changes: 73 additions & 78 deletions text2vec.py
@@ -1,77 +1,74 @@
import math
import numpy as np
import spacy
from gensim import models
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim import corpora, models, similarities
from gensim.matutils import sparse2full
import numpy as np
import math

from gensim.models.tfidfmodel import TfidfModel


#text2vec methods
class text2vec():
# Text2Vec Class
class Text2Vec:
def __init__(self, doc_list):
#Initialize
# Initialize
self.doc_list = doc_list
self.nlp, self.docs, self.docs_dict = self._preprocess(self.doc_list)

# Functions to lemmatise docs
def _keep_token(self, t):
return (t.is_alpha and
not (t.is_space or t.is_punct or
return (t.is_alpha and
not (t.is_space or t.is_punct or
t.is_stop or t.like_num))
def _lemmatize_doc(self, doc):
return [ t.lemma_ for t in doc if self._keep_token(t)]

def _lemmatize_doc(self, doc):
return [t.lemma_ for t in doc if self._keep_token(t)]

#Gensim to create a dictionary and filter out stop and infrequent words (lemmas).
# Use Gensim to create a dictionary and filter out stop and infrequent words (lemmas).
def _get_docs_dict(self, docs):
docs_dict = Dictionary(docs)
#CAREFUL: For small corpus please carefully modify the parameters for filter_extremes, or simply comment it out.
docs_dict.filter_extremes(no_below=5, no_above=0.2)
# CAREFUL: for a small corpus, adjust the filter_extremes parameters carefully, or simply comment it out (as done here).
# docs_dict.filter_extremes(no_below=5, no_above=0.2)
docs_dict.compactify()
return docs_dict

# Preprocess docs
def _preprocess(self, doc_list):
#Load spacy model
nlp = spacy.load('en')
#lemmatise docs
docs = [self._lemmatize_doc(nlp(doc)) for doc in doc_list]
#Get docs dictionary
# Load spacy model
nlp = spacy.load('en_core_web_md')
# lemmatise docs
# docs = [self._lemmatize_doc(nlp(doc)) for doc in doc_list]
docs = [[nlp(doc).text] for doc in doc_list]
# Get docs dictionary
docs_dict = self._get_docs_dict(docs)
return nlp, docs, docs_dict


# Gensim can again be used to create a bag-of-words representation of each document,
# build the TF-IDF model,
# build the TF-IDF model,
# and compute the TF-IDF vector for each document.
def _get_tfidf(self, docs, docs_dict):
docs_corpus = [docs_dict.doc2bow(doc) for doc in docs]
model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
docs_tfidf = model_tfidf[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_tfidf])
def _get_tfidf(self):
docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
model_tfidf = TfidfModel(docs_corpus, id2word=self.docs_dict)
docs_tfidf = model_tfidf[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_tfidf])
return docs_vecs


#Get avg w2v for one document
# Get avg w2v for one document
def _document_vector(self, doc, docs_dict, nlp):
# remove out-of-vocabulary words
doc_vector = [nlp(word).vector for word in doc if word in docs_dict.token2id]
return np.mean(doc_vector, axis=0)


# Get a TF-IDF weighted Glove vector summary for document list
# Input: a list of documents, Output: Matrix of vector for all the documents
def tfidf_weighted_wv(self):
#tf-idf
docs_vecs = self._get_tfidf(self.docs, self.docs_dict)
# tf-idf
docs_vecs = self._get_tfidf()

#Load glove embedding vector for each TF-IDF term
# Load glove embedding vector for each TF-IDF term
tfidf_emb_vecs = np.vstack([self.nlp(self.docs_dict[i]).vector for i in range(len(self.docs_dict))])

#To get a TF-IDF weighted Glove vector summary of each document,
#we just need to matrix multiply docs_vecs with tfidf_emb_vecs
# To get a TF-IDF weighted Glove vector summary of each document,
# we just need to matrix multiply docs_vecs with tfidf_emb_vecs
docs_emb = np.dot(docs_vecs, tfidf_emb_vecs)

return docs_emb
Expand All @@ -85,79 +82,77 @@ def avg_wv(self):
def get_tfidf(self):
docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
model_tfidf = TfidfModel(docs_corpus, id2word=self.docs_dict)
docs_tfidf = model_tfidf[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_tfidf])
docs_tfidf = model_tfidf[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_tfidf])
return docs_vecs


# Get Latent Semantic Indexing(LSI) vector for document list
def get_lsi(self, num_topics=300):
docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
model_lsi = models.LsiModel(docs_corpus, num_topics, id2word=self.docs_dict)
docs_lsi = model_lsi[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lsi])
docs_lsi = model_lsi[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lsi])
return docs_vecs

# Get Random Projections(RP) vector for document list
def get_rp(self):
docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
model_rp = models.RpModel(docs_corpus, id2word=self.docs_dict)
docs_rp = model_rp[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_rp])
docs_rp = model_rp[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_rp])
return docs_vecs

# Get Latent Dirichlet Allocation(LDA) vector for document list
def get_lda(self, num_topics=100):
docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
model_lda = models.LdaModel(docs_corpus, num_topics, id2word=self.docs_dict)
docs_lda = model_lda[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lda])
docs_lda = model_lda[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lda])
return docs_vecs

# Get Hierarchical Dirichlet Process(HDP) vector for document list
def get_hdp(self):
docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
model_hdp = models.HdpModel(docs_corpus, id2word=self.docs_dict)
docs_hdp = model_hdp[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_hdp])
docs_hdp = model_hdp[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_hdp])
return docs_vecs



#Similarity Calculation methods
class simical():

# Similarity calculation class
class SimiCal:
def __init__(self, vec1, vec2):
self.vec1 = vec1
self.vec2 = vec2

def _VectorSize(self, vec) :
return math.sqrt(sum(math.pow(v,2) for v in vec))

def _InnerProduct(self) :
return sum(v1*v2 for v1,v2 in zip(self.vec1,self.vec2))

def _Theta(self) :
return math.acos(self.Cosine()) + 10
def _Magnitude_Difference(self) :
return abs(self._VectorSize(self.vec1) - self._VectorSize(self.vec2))
def Euclidean(self) :
return math.sqrt(sum(math.pow((v1-v2),2) for v1,v2 in zip(self.vec1, self.vec2)))
def Cosine(self) :
result = self._InnerProduct() / (self._VectorSize(self.vec1) * self._VectorSize(self.vec2))
def _vector_size(self, vec):
return math.sqrt(sum(math.pow(v, 2) for v in vec))

def _inner_product(self):
return sum(v1 * v2 for v1, v2 in zip(self.vec1, self.vec2))

def _theta(self):
return math.acos(self.cosine()) + 10

def _magnitude_difference(self):
return abs(self._vector_size(self.vec1) - self._vector_size(self.vec2))

def euclidean(self):
return math.sqrt(sum(math.pow((v1 - v2), 2) for v1, v2 in zip(self.vec1, self.vec2)))

def cosine(self):
result = self._inner_product() / (self._vector_size(self.vec1) * self._vector_size(self.vec2))
return result

def Triangle(self) :
theta = math.radians(self._Theta())
return (self._VectorSize(self.vec1) * self._VectorSize(self.vec2) * math.sin(theta)) / 2
def triangle(self):
theta = math.radians(self._theta())
return (self._vector_size(self.vec1) * self._vector_size(self.vec2) * math.sin(theta)) / 2

def Sector(self) :
ED = self.Euclidean()
MD = self._Magnitude_Difference()
theta = self._Theta()
return math.pi * math.pow((ED+MD),2) * theta/360
def sector(self):
ed = self.euclidean()
md = self._magnitude_difference()
theta = self._theta()
return math.pi * math.pow((ed + md), 2) * theta / 360

def TS_SS(self) :
return self.Triangle() * self.Sector()
def ts_ss(self):
return self.triangle() * self.sector()
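
For reviewers who want a quick sanity check of the renamed API, a toy example (values are illustrative; the `+ 10` offset in `_theta` follows the referenced Vector_Similarity implementation):

```
import text2vec

# Two short toy vectors with similar direction.
sc = text2vec.SimiCal([1.0, 2.0], [2.0, 3.0])
print(sc.cosine())     # close to 1: nearly parallel vectors
print(sc.euclidean())  # straight-line distance
print(sc.ts_ss())      # TS-SS distance: triangle area * sector area
```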
2 changes: 1 addition & 1 deletion wv_wrt_tfidf.md
@@ -72,7 +72,7 @@ To wrap-up, here is the part of code in text2vec.py:
```
def tfidf_weighted_wv(self):
#tf-idf
docs_vecs = self._get_tfidf(self.docs, self.docs_dict)
docs_vecs = self._get_tfidf()

#Load glove embedding vector for each TF-IDF term
tfidf_emb_vecs = np.vstack([self.nlp(self.docs_dict[i]).vector for i in range(len(self.docs_dict))])