diff --git a/README.md b/README.md
index 013659d..4cd2967 100644
--- a/README.md
+++ b/README.md
@@ -19,10 +19,10 @@ import text2vec
 
 * input: List of Documents, doc_list is a list of documents/paragraphs/sentences.
 
 ```
-t2v = text2vec.text2vec(doc_list)
+t2v = text2vec.Text2Vec(doc_list)
 ```
 
-* output: List of Vectors of dimention N
+* output: List of Vectors of dimension N
 
 We perform this transformation in the following ways.
 
@@ -52,32 +52,32 @@ docs_emb = t2v.tfidf_weighted_wv()
 
 For a more detailed introduction to using Weighted Word Embeddings wrt. TFIDF, please read [here](https://github.com/crownpku/text2vec/blob/master/wv_wrt_tfidf.md).
 
-## Usage of Similarity Calculation (simical)
+## Usage of Similarity Calculation (SimiCal)
 
 For example, suppose we want to calculate the similarity/distance between the first two sentences in the docs_emb we just computed.
 
 Note that cosine similarity lies between 0 and 1 (1 is most similar, 0 is least similar).
 
-For the other similarity measurements the results are actually **distance** (the larget the less similar). It's better to calculate distance for all possible pairs and then rank.
+For the other similarity measurements the results are actually **distance** (the larger, the less similar). It's better to calculate the distance for all possible pairs and then rank them.
 
 ```
 # Initialize
 import text2vec
-sc = text2vec.simical(docs_emb[0], docs_emb[1])
+sc = text2vec.SimiCal(docs_emb[0], docs_emb[1])
 
 # Use Cosine
-simi_cos = sc.Cosine()
+simi_cos = sc.cosine()
 
 # Use Euclidean
-simi_euc = sc.Euclidean()
+simi_euc = sc.euclidean()
 
 # Use Triangle's Area Similarity (TS)
-simi_ts = sc.Triangle()
+simi_ts = sc.triangle()
 
 # Use Sector's Area Similarity (SS)
-simi_ss = sc.Sector()
+simi_ss = sc.sector()
 
 # Use TS-SS
-simi_ts_ss = sc.TS_SS()
+simi_ts_ss = sc.ts_ss()
 ```
 
 ## Reference
 
@@ -87,6 +87,3 @@
 https://radimrehurek.com/gensim/tut2.html
 https://github.com/sdimi/average-word2vec
 https://github.com/taki0112/Vector_Similarity
-
-
-
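The README's pair-then-rank advice in practice: a minimal sketch, assuming `docs_emb` comes from `t2v.tfidf_weighted_wv()` as above; the loop and variable names are illustrative, not part of the patch.

```
# Rank every document pair by TS-SS distance (smaller = more similar).
from itertools import combinations

import text2vec

pairs = []
for i, j in combinations(range(len(docs_emb)), 2):
    sc = text2vec.SimiCal(docs_emb[i], docs_emb[j])
    pairs.append(((i, j), sc.ts_ss()))

# TS-SS is a distance, so sort ascending; the head of the list holds the
# most similar pairs. (Floating point can push cosine a hair above 1.0;
# clamp before acos if ts_ss() ever complains about its domain.)
pairs.sort(key=lambda p: p[1])
print(pairs[:5])
```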
diff --git a/text2vec.py b/text2vec.py
index 0b9d82c..ddf6fd6 100644
--- a/text2vec.py
+++ b/text2vec.py
@@ -1,77 +1,74 @@
+import math
+import numpy as np
 import spacy
+from gensim import models
 from gensim.corpora import Dictionary
-from gensim.models.tfidfmodel import TfidfModel
-from gensim import corpora, models, similarities
 from gensim.matutils import sparse2full
-import numpy as np
-import math
-
+from gensim.models.tfidfmodel import TfidfModel
 
-#text2vec methods
-class text2vec():
+# Text2Vec Class
+class Text2Vec:
     def __init__(self, doc_list):
-        #Initialize
+        # Initialize
        self.doc_list = doc_list
         self.nlp, self.docs, self.docs_dict = self._preprocess(self.doc_list)
-    
+
     # Functions to lemmatise docs
     def _keep_token(self, t):
-        return (t.is_alpha and 
-                not (t.is_space or t.is_punct or 
+        return (t.is_alpha and
+                not (t.is_space or t.is_punct or
                      t.is_stop or t.like_num))
 
-    def _lemmatize_doc(self, doc): 
-        return [ t.lemma_ for t in doc if self._keep_token(t)]
+    def _lemmatize_doc(self, doc):
+        return [t.lemma_ for t in doc if self._keep_token(t)]
 
-    #Gensim to create a dictionary and filter out stop and infrequent words (lemmas).
+    # Gensim to create a dictionary and filter out stop and infrequent words (lemmas).
     def _get_docs_dict(self, docs):
         docs_dict = Dictionary(docs)
-        #CAREFUL: For small corpus please carefully modify the parameters for filter_extremes, or simply comment it out.
-        docs_dict.filter_extremes(no_below=5, no_above=0.2)
+        # CAREFUL: For a small corpus, carefully modify the parameters for filter_extremes, or simply comment it out.
+        # docs_dict.filter_extremes(no_below=5, no_above=0.2)
         docs_dict.compactify()
         return docs_dict
 
     # Preprocess docs
     def _preprocess(self, doc_list):
-        #Load spacy model
-        nlp = spacy.load('en')
-        #lemmatise docs
-        docs = [self._lemmatize_doc(nlp(doc)) for doc in doc_list]
-        #Get docs dictionary
+        # Load spacy model
+        nlp = spacy.load('en_core_web_md')
+        # lemmatise docs
+        # docs = [self._lemmatize_doc(nlp(doc)) for doc in doc_list]
+        docs = [[nlp(doc).text] for doc in doc_list]
+        # Get docs dictionary
         docs_dict = self._get_docs_dict(docs)
         return nlp, docs, docs_dict
-
     # Gensim can again be used to create a bag-of-words representation of each document,
-    # build the TF-IDF model, 
+    # build the TF-IDF model,
     # and compute the TF-IDF vector for each document.
-    def _get_tfidf(self, docs, docs_dict):
-        docs_corpus = [docs_dict.doc2bow(doc) for doc in docs]
-        model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
-        docs_tfidf = model_tfidf[docs_corpus]
-        docs_vecs = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_tfidf])
+    def _get_tfidf(self):
+        docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
+        model_tfidf = TfidfModel(docs_corpus, id2word=self.docs_dict)
+        docs_tfidf = model_tfidf[docs_corpus]
+        docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_tfidf])
         return docs_vecs
-
-    #Get avg w2v for one document
+    # Get avg w2v for one document
     def _document_vector(self, doc, docs_dict, nlp):
         # remove out-of-vocabulary words
         doc_vector = [nlp(word).vector for word in doc if word in docs_dict.token2id]
         return np.mean(doc_vector, axis=0)
-
     # Get a TF-IDF weighted Glove vector summary for document list
     # Input: a list of documents, Output: Matrix of vector for all the documents
     def tfidf_weighted_wv(self):
-        #tf-idf
-        docs_vecs = self._get_tfidf(self.docs, self.docs_dict)
+        # tf-idf
+        docs_vecs = self._get_tfidf()
 
-        #Load glove embedding vector for each TF-IDF term
+        # Load glove embedding vector for each TF-IDF term
         tfidf_emb_vecs = np.vstack([self.nlp(self.docs_dict[i]).vector for i in range(len(self.docs_dict))])
 
-        #To get a TF-IDF weighted Glove vector summary of each document,
-        #we just need to matrix multiply docs_vecs with tfidf_emb_vecs
+        # To get a TF-IDF weighted Glove vector summary of each document,
+        # we just need to matrix multiply docs_vecs with tfidf_emb_vecs
         docs_emb = np.dot(docs_vecs, tfidf_emb_vecs)
         return docs_emb
 
@@ -85,79 +82,77 @@ def avg_wv(self):
     def get_tfidf(self):
         docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
         model_tfidf = TfidfModel(docs_corpus, id2word=self.docs_dict)
-        docs_tfidf = model_tfidf[docs_corpus] 
-        docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_tfidf]) 
+        docs_tfidf = model_tfidf[docs_corpus]
+        docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_tfidf])
         return docs_vecs
-
     # Get Latent Semantic Indexing(LSI) vector for document list
     def get_lsi(self, num_topics=300):
         docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
         model_lsi = models.LsiModel(docs_corpus, num_topics, id2word=self.docs_dict)
-        docs_lsi = model_lsi[docs_corpus] 
-        docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lsi]) 
+        docs_lsi = model_lsi[docs_corpus]
+        docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lsi])
         return docs_vecs
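+    # NOTE: LSI vectors have num_topics dimensions, so sparse2full(c, num_topics)
+    # may be the intended length here (and in get_rp/get_lda/get_hdp below);
+    # padding to len(self.docs_dict) only lines up when the dictionary has at
+    # least num_topics entries.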
 
     # Get Random Projections(RP) vector for document list
     def get_rp(self):
         docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
         model_rp = models.RpModel(docs_corpus, id2word=self.docs_dict)
-        docs_rp = model_rp[docs_corpus] 
-        docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_rp]) 
+        docs_rp = model_rp[docs_corpus]
+        docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_rp])
         return docs_vecs
 
     # Get Latent Dirichlet Allocation(LDA) vector for document list
     def get_lda(self, num_topics=100):
         docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
         model_lda = models.LdaModel(docs_corpus, num_topics, id2word=self.docs_dict)
-        docs_lda = model_lda[docs_corpus] 
-        docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lda]) 
+        docs_lda = model_lda[docs_corpus]
+        docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lda])
         return docs_vecs
 
     # Get Hierarchical Dirichlet Process(HDP) vector for document list
     def get_hdp(self):
         docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
         model_hdp = models.HdpModel(docs_corpus, id2word=self.docs_dict)
-        docs_hdp = model_hdp[docs_corpus] 
-        docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_hdp]) 
+        docs_hdp = model_hdp[docs_corpus]
+        docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_hdp])
         return docs_vecs
-
-
-#Similarity Calculation methods
-class simical():
+
+# Similarity calculation class
+class SimiCal:
     def __init__(self, vec1, vec2):
         self.vec1 = vec1
         self.vec2 = vec2
 
-    def _VectorSize(self, vec) :
-        return math.sqrt(sum(math.pow(v,2) for v in vec))
-
-    def _InnerProduct(self) :
-        return sum(v1*v2 for v1,v2 in zip(self.vec1,self.vec2))
-
-    def _Theta(self) :
-        return math.acos(self.Cosine()) + 10
-
-    def _Magnitude_Difference(self) :
-        return abs(self._VectorSize(self.vec1) - self._VectorSize(self.vec2))
-
-    def Euclidean(self) :
-        return math.sqrt(sum(math.pow((v1-v2),2) for v1,v2 in zip(self.vec1, self.vec2)))
-
-    def Cosine(self) :
-        result = self._InnerProduct() / (self._VectorSize(self.vec1) * self._VectorSize(self.vec2))
+    def _vector_size(self, vec):
+        return math.sqrt(sum(math.pow(v, 2) for v in vec))
+
+    def _inner_product(self):
+        return sum(v1 * v2 for v1, v2 in zip(self.vec1, self.vec2))
+
+    def _theta(self):
+        return math.acos(self.cosine()) + 10
+
+    def _magnitude_difference(self):
+        return abs(self._vector_size(self.vec1) - self._vector_size(self.vec2))
+
+    def euclidean(self):
+        return math.sqrt(sum(math.pow((v1 - v2), 2) for v1, v2 in zip(self.vec1, self.vec2)))
+
+    def cosine(self):
+        result = self._inner_product() / (self._vector_size(self.vec1) * self._vector_size(self.vec2))
         return result
 
-    def Triangle(self) :
-        theta = math.radians(self._Theta())
-        return (self._VectorSize(self.vec1) * self._VectorSize(self.vec2) * math.sin(theta)) / 2
+    def triangle(self):
+        theta = math.radians(self._theta())
+        return (self._vector_size(self.vec1) * self._vector_size(self.vec2) * math.sin(theta)) / 2
 
-    def Sector(self) :
-        ED = self.Euclidean()
-        MD = self._Magnitude_Difference()
-        theta = self._Theta()
-        return math.pi * math.pow((ED+MD),2) * theta/360
+    def sector(self):
+        ed = self.euclidean()
+        md = self._magnitude_difference()
+        theta = self._theta()
+        return math.pi * math.pow((ed + md), 2) * theta / 360
 
-    def TS_SS(self) :
-        return self.Triangle() * self.Sector()
+    def ts_ss(self):
+        return self.triangle() * self.sector()
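A quick sanity check of the renamed `SimiCal` methods on two hand-picked vectors (toy values chosen for illustration, not from the repo):

```
from text2vec import SimiCal

sc = SimiCal([1.0, 0.0], [0.0, 1.0])

# Orthogonal unit vectors: cosine similarity 0, Euclidean distance sqrt(2).
assert abs(sc.cosine()) < 1e-9
assert abs(sc.euclidean() - 2 ** 0.5) < 1e-9

# _theta() adds 10 to the angle, following the upstream Vector_Similarity
# reference, so triangle(), sector() and ts_ss() are best read as relative
# distances for ranking rather than as absolute quantities.
print(sc.triangle(), sc.sector(), sc.ts_ss())
```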
diff --git a/wv_wrt_tfidf.md b/wv_wrt_tfidf.md
index 9c9d13c..c1ebe21 100644
--- a/wv_wrt_tfidf.md
+++ b/wv_wrt_tfidf.md
@@ -72,7 +72,7 @@ To wrap-up, here is the part of code in text2vec.py:
 
 ```
 def tfidf_weighted_wv(self):
     #tf-idf
-    docs_vecs = self._get_tfidf(self.docs, self.docs_dict)
+    docs_vecs = self._get_tfidf()
 
     #Load glove embedding vector for each TF-IDF term
     tfidf_emb_vecs = np.vstack([self.nlp(self.docs_dict[i]).vector for i in range(len(self.docs_dict))])
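To make the shape bookkeeping behind `tfidf_weighted_wv()` concrete, here is a toy NumPy sketch (made-up sizes and random values, not repo output):

```
import numpy as np

n_docs, vocab, emb_dim = 4, 6, 3
docs_vecs = np.random.rand(n_docs, vocab)        # TF-IDF weight per (doc, term)
tfidf_emb_vecs = np.random.rand(vocab, emb_dim)  # one word vector per term

# Row i of the product is the TF-IDF weighted sum of doc i's word vectors.
docs_emb = np.dot(docs_vecs, tfidf_emb_vecs)
assert docs_emb.shape == (n_docs, emb_dim)
```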