Vector Semantics

Vector Semantics

import spacy
import numpy as np
from scipy.spatial import distance
# from gensim.models import Word2Vec
import nltk
import pandas as pd
import flair

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 7
      5 import nltk
      6 import pandas as pd
----> 7 import flair

ModuleNotFoundError: No module named 'flair'

# !python -m spacy download en_core_web_md
# nlp=spacy.load('en_core_web_md')
from flair.embeddings import WordEmbeddings
from flair.data import Sentence

# Load the pre-trained GloVe word embeddings through flair.
glove_embedding = WordEmbeddings('glove')

# Wrap the example text in a flair Sentence and embed it with GloVe.
sentence = Sentence('The grass is green .')
glove_embedding.embed(sentence)

# Show every token followed by the embedding now attached to it.
for token in sentence:
    print(token, token.embedding, sep="\n")

def vec(s: str):
    """Return the ``.vector`` of the object produced by calling ``nlp`` on *s*.

    Relies on a module-level ``nlp`` callable being defined before use.
    """
    doc = nlp(s)
    return doc.vector


vec('lion')

your_word = "lion"

# Resolve the query word to its key, then wrap its vector in a single-row
# 2-D array because `most_similar` is called with a batch of queries.
query_key = nlp.vocab.strings[your_word]
query = np.asarray([nlp.vocab.vectors[query_key]])
ms = nlp.vocab.vectors.most_similar(query, n=10)

# ms[0][0] is the row of neighbour keys for the single query; map each key
# back to its string form.
neighbour_keys = ms[0][0]
words = [nlp.vocab.strings[w] for w in neighbour_keys]
# NOTE(review): spaCy documents the third element of the result as similarity
# scores, so `distances` presumably holds similarities — confirm before use.
distances = ms[2]
print(words)

# Build the candidate vocabulary for the distance search: every string known
# to the vector table, lower-cased, restricted to tokens made only of the
# letters a-z, and de-duplicated.
words = pd.Series(list(nlp.vocab.vectors.strings)).str.lower().pipe(
    lambda s: s[s.str.fullmatch('[a-z]+')]
).unique()

# One row per entry of `words`, in the same order, so a row index into
# `vocab_vectors` is also an index into `words`.
# The original looked the *vector* up in the string store
# (`strings[vectors[x]]`), which is the wrong way round. `get_vector` takes
# the word itself and, per the spaCy docs, returns a zero vector when the
# (lower-cased) form has no entry instead of raising.
vocab_vectors = np.array([nlp.vocab.get_vector(x) for x in words])
## use nltk

# Matrix of pairwise dot products between all vocabulary vectors.
# NOTE(review): this is len(words) x len(words) and may be very large.
vocab_vectors @ vocab_vectors.T

# Alternative single-word query (the nearest word is then expected to be
# identical, or very close, to the input word):
# input_word = "frog"
# p = np.array([nlp.vocab[input_word].vector])

# Query point built by vector arithmetic, wrapped as a single-row 2-D array
# because `cdist` compares two collections of row vectors.
p = np.array([vec('king') + vec('queen') - vec('man')])

# *** Find the closest word below ***
# Row of `vocab_vectors` with the smallest distance to the query point.
closest_index = distance.cdist(p, vocab_vectors).argmin()

# `vocab_vectors` is built row-for-row from `words`, so the index must be
# resolved through `words`. The original indexed the unfiltered
# `nlp.vocab.vectors.keys()` list, which has a different order and length and
# therefore returned an unrelated word. `strings.add` returns the hash key and
# guarantees the string is registered for the lookup on the next line.
word_id = nlp.vocab.strings.add(words[closest_index])
nlp.vocab[word_id].text

# Bare reference to NumPy's `ndarray.argsort` method; the expression is not
# called or assigned here (presumably evaluated to inspect it in the notebook).
np.ndarray.argsort