23 changes: 10 additions & 13 deletions README.md
@@ -19,10 +19,10 @@ import text2vec

* input: List of Documents; `doc_list` is a list of documents/paragraphs/sentences.
```
t2v = text2vec.text2vec(doc_list)
t2v = text2vec.Text2Vec(doc_list)
```

* output: List of Vectors of dimention N
* output: List of Vectors of dimension N
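
To make the input/output contract concrete, here is a minimal usage sketch (assuming the renamed `Text2Vec` class from this PR and a spaCy model with word vectors installed; the documents are illustrative):

```
# Minimal usage sketch; doc_list contents are illustrative.
import text2vec

doc_list = [
    "The cat sat on the mat.",
    "Dogs are loyal companions.",
    "Word embeddings capture semantic similarity.",
]

t2v = text2vec.Text2Vec(doc_list)

# Each transformation returns a matrix with one N-dimensional row per document.
docs_emb = t2v.tfidf_weighted_wv()
print(docs_emb.shape)  # (number of documents, N)
```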

We perform this transformation in the following ways.

@@ -52,32 +52,32 @@ docs_emb = t2v.tfidf_weighted_wv()
For a more detailed introduction to using Weighted Word Embeddings w.r.t. TF-IDF, please read [here](https://github.com/crownpku/text2vec/blob/master/wv_wrt_tfidf.md).
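
The weighting step itself is just a matrix product between the TF-IDF matrix and the term-embedding matrix; here is a minimal NumPy sketch of the idea (all shapes are illustrative assumptions):

```
import numpy as np

# Illustrative shapes: D documents, V vocabulary terms, N embedding dimensions.
D, V, N = 3, 100, 300
docs_vecs = np.random.rand(D, V)       # TF-IDF weight of each term in each document
tfidf_emb_vecs = np.random.rand(V, N)  # one word embedding per vocabulary term

# Each document vector is the TF-IDF weighted sum of its term embeddings.
docs_emb = docs_vecs @ tfidf_emb_vecs  # shape (D, N)
```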


## Usage of Similarity Calculation (simical)
## Usage of Similarity Calculation (SimiCal)

For example, suppose we want to calculate the similarity/distance between the first two sentences in the `docs_emb` we just computed.

Note that cosine similarity ranges from -1 to 1 (1 is most similar; for non-negative vectors it stays within 0-1).
For the other similarity measurements the results are actually **distance** (the larget the less similar). It's better to calculate distance for all possible pairs and then rank.
For the other similarity measurements the results are actually **distance** (the larger the value, the less similar). It's better to calculate distances for all possible pairs and then rank them; see the ranking sketch after the code block below.

```
# Initialize
import text2vec
sc = text2vec.simical(docs_emb[0], docs_emb[1])
sc = text2vec.SimiCal(docs_emb[0], docs_emb[1])

# Use Cosine
simi_cos = sc.Cosine()
simi_cos = sc.cosine()

# Use Euclidean
simi_euc = sc.Euclidean()
simi_euc = sc.euclidean()

# Use Triangle's Area Similarity (TS)
simi_ts = sc.Triangle()
simi_ts = sc.triangle()

# Use Sector's Area Similarity (SS)
simi_ss = sc.Sector()
simi_ss = sc.sector()

# Use TS-SS
simi_ts_ss = sc.TS_SS()
simi_ts_ss = sc.ts_ss()
```
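
Since the non-cosine measures are distances, a practical pattern is to score every pair and sort ascending. A hypothetical sketch (not part of this PR), reusing the `docs_emb` computed above:

```
from itertools import combinations

import text2vec

# Score all document pairs; a smaller TS-SS distance means more similar.
pairs = []
for i, j in combinations(range(len(docs_emb)), 2):
    sc = text2vec.SimiCal(docs_emb[i], docs_emb[j])
    pairs.append(((i, j), sc.ts_ss()))

pairs.sort(key=lambda p: p[1])
print(pairs[:3])  # the three most similar pairs
```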

## Reference
@@ -87,6 +87,3 @@ https://radimrehurek.com/gensim/tut2.html
https://github.com/sdimi/average-word2vec

https://github.com/taki0112/Vector_Similarity



151 changes: 73 additions & 78 deletions text2vec.py
@@ -1,77 +1,74 @@
import math
import numpy as np
import spacy
from gensim import models
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim import corpora, models, similarities
from gensim.matutils import sparse2full
import numpy as np
import math

from gensim.models.tfidfmodel import TfidfModel


#text2vec methods
class text2vec():
# Text2Vec Class
class Text2Vec:
def __init__(self, doc_list):
#Initialize
# Initialize
self.doc_list = doc_list
self.nlp, self.docs, self.docs_dict = self._preprocess(self.doc_list)

# Functions to lemmatise docs
def _keep_token(self, t):
return (t.is_alpha and
not (t.is_space or t.is_punct or
return (t.is_alpha and
not (t.is_space or t.is_punct or
t.is_stop or t.like_num))
def _lemmatize_doc(self, doc):
return [ t.lemma_ for t in doc if self._keep_token(t)]

def _lemmatize_doc(self, doc):
return [t.lemma_ for t in doc if self._keep_token(t)]

#Gensim to create a dictionary and filter out stop and infrequent words (lemmas).
# Use Gensim to create a dictionary and filter out stop and infrequent words (lemmas).
def _get_docs_dict(self, docs):
docs_dict = Dictionary(docs)
#CAREFUL: For small corpus please carefully modify the parameters for filter_extremes, or simply comment it out.
docs_dict.filter_extremes(no_below=5, no_above=0.2)
# CAREFUL: for a small corpus, adjust the filter_extremes parameters carefully, or simply comment it out (as done here).
# docs_dict.filter_extremes(no_below=5, no_above=0.2)
docs_dict.compactify()
return docs_dict

# Preprocess docs
def _preprocess(self, doc_list):
#Load spacy model
nlp = spacy.load('en')
#lemmatise docs
docs = [self._lemmatize_doc(nlp(doc)) for doc in doc_list]
#Get docs dictionary
# Load spacy model
nlp = spacy.load('en_core_web_md')
# lemmatise docs
# docs = [self._lemmatize_doc(nlp(doc)) for doc in doc_list]
docs = [[nlp(doc).text] for doc in doc_list]
# Get docs dictionary
docs_dict = self._get_docs_dict(docs)
return nlp, docs, docs_dict


# Gensim can again be used to create a bag-of-words representation of each document,
# build the TF-IDF model,
# build the TF-IDF model,
# and compute the TF-IDF vector for each document.
def _get_tfidf(self, docs, docs_dict):
docs_corpus = [docs_dict.doc2bow(doc) for doc in docs]
model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
docs_tfidf = model_tfidf[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_tfidf])
def _get_tfidf(self):
docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
model_tfidf = TfidfModel(docs_corpus, id2word=self.docs_dict)
docs_tfidf = model_tfidf[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_tfidf])
return docs_vecs


#Get avg w2v for one document
# Get avg w2v for one document
def _document_vector(self, doc, docs_dict, nlp):
# remove out-of-vocabulary words
doc_vector = [nlp(word).vector for word in doc if word in docs_dict.token2id]
return np.mean(doc_vector, axis=0)


# Get a TF-IDF weighted Glove vector summary for document list
# Input: a list of documents, Output: Matrix of vector for all the documents
def tfidf_weighted_wv(self):
#tf-idf
docs_vecs = self._get_tfidf(self.docs, self.docs_dict)
# tf-idf
docs_vecs = self._get_tfidf()

#Load glove embedding vector for each TF-IDF term
# Load glove embedding vector for each TF-IDF term
tfidf_emb_vecs = np.vstack([self.nlp(self.docs_dict[i]).vector for i in range(len(self.docs_dict))])

#To get a TF-IDF weighted Glove vector summary of each document,
#we just need to matrix multiply docs_vecs with tfidf_emb_vecs
# To get a TF-IDF weighted Glove vector summary of each document,
# we just need to matrix multiply docs_vecs with tfidf_emb_vecs
docs_emb = np.dot(docs_vecs, tfidf_emb_vecs)

return docs_emb
Expand All @@ -85,79 +82,77 @@ def avg_wv(self):
def get_tfidf(self):
docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
model_tfidf = TfidfModel(docs_corpus, id2word=self.docs_dict)
docs_tfidf = model_tfidf[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_tfidf])
docs_tfidf = model_tfidf[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_tfidf])
return docs_vecs


# Get Latent Semantic Indexing(LSI) vector for document list
def get_lsi(self, num_topics=300):
docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
model_lsi = models.LsiModel(docs_corpus, num_topics, id2word=self.docs_dict)
docs_lsi = model_lsi[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lsi])
docs_lsi = model_lsi[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lsi])
return docs_vecs

# Get Random Projections(RP) vector for document list
def get_rp(self):
docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
model_rp = models.RpModel(docs_corpus, id2word=self.docs_dict)
docs_rp = model_rp[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_rp])
docs_rp = model_rp[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_rp])
return docs_vecs

# Get Latent Dirichlet Allocation(LDA) vector for document list
def get_lda(self, num_topics=100):
docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
model_lda = models.LdaModel(docs_corpus, num_topics, id2word=self.docs_dict)
docs_lda = model_lda[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lda])
docs_lda = model_lda[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lda])
return docs_vecs

# Get Hierarchical Dirichlet Process(HDP) vector for document list
def get_hdp(self):
docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
model_hdp = models.HdpModel(docs_corpus, id2word=self.docs_dict)
docs_hdp = model_hdp[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_hdp])
docs_hdp = model_hdp[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_hdp])
return docs_vecs



#Similarity Calculation methods
class simical():

# Similarity calculation class
class SimiCal:
def __init__(self, vec1, vec2):
self.vec1 = vec1
self.vec2 = vec2

def _VectorSize(self, vec) :
return math.sqrt(sum(math.pow(v,2) for v in vec))

def _InnerProduct(self) :
return sum(v1*v2 for v1,v2 in zip(self.vec1,self.vec2))

def _Theta(self) :
return math.acos(self.Cosine()) + 10
def _Magnitude_Difference(self) :
return abs(self._VectorSize(self.vec1) - self._VectorSize(self.vec2))
def Euclidean(self) :
return math.sqrt(sum(math.pow((v1-v2),2) for v1,v2 in zip(self.vec1, self.vec2)))
def Cosine(self) :
result = self._InnerProduct() / (self._VectorSize(self.vec1) * self._VectorSize(self.vec2))
def _vector_size(self, vec):
return math.sqrt(sum(math.pow(v, 2) for v in vec))

def _inner_product(self):
return sum(v1 * v2 for v1, v2 in zip(self.vec1, self.vec2))

def _theta(self):
return math.acos(self.cosine()) + 10

def _magnitude_difference(self):
return abs(self._vector_size(self.vec1) - self._vector_size(self.vec2))

def euclidean(self):
return math.sqrt(sum(math.pow((v1 - v2), 2) for v1, v2 in zip(self.vec1, self.vec2)))

def cosine(self):
result = self._inner_product() / (self._vector_size(self.vec1) * self._vector_size(self.vec2))
return result

def Triangle(self) :
theta = math.radians(self._Theta())
return (self._VectorSize(self.vec1) * self._VectorSize(self.vec2) * math.sin(theta)) / 2
def triangle(self):
theta = math.radians(self._theta())
return (self._vector_size(self.vec1) * self._vector_size(self.vec2) * math.sin(theta)) / 2

def Sector(self) :
ED = self.Euclidean()
MD = self._Magnitude_Difference()
theta = self._Theta()
return math.pi * math.pow((ED+MD),2) * theta/360
def sector(self):
ed = self.euclidean()
md = self._magnitude_difference()
theta = self._theta()
return math.pi * math.pow((ed + md), 2) * theta / 360

def TS_SS(self) :
return self.Triangle() * self.Sector()
def ts_ss(self):
return self.triangle() * self.sector()
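
For reviewers who want a quick sanity check of the renamed API, a toy example (values are illustrative; the `+ 10` offset in `_theta` follows the referenced Vector_Similarity implementation):

```
import text2vec

# Two short toy vectors with similar direction.
sc = text2vec.SimiCal([1.0, 2.0], [2.0, 3.0])
print(sc.cosine())     # close to 1: nearly parallel vectors
print(sc.euclidean())  # straight-line distance
print(sc.ts_ss())      # TS-SS distance: triangle area * sector area
```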
2 changes: 1 addition & 1 deletion wv_wrt_tfidf.md
@@ -72,7 +72,7 @@ To wrap-up, here is the part of code in text2vec.py:
```
def tfidf_weighted_wv(self):
#tf-idf
docs_vecs = self._get_tfidf(self.docs, self.docs_dict)
docs_vecs = self._get_tfidf()

#Load glove embedding vector for each TF-IDF term
tfidf_emb_vecs = np.vstack([self.nlp(self.docs_dict[i]).vector for i in range(len(self.docs_dict))])