Let Simmer, Unsupervised#

%load_ext coconut
import re
import numpy as np
# Fixed seed so the sampling mask built later in the notebook is reproducible.
rng = np.random.default_rng(2)
import pandas as pd
import janitor as pj  # registers pyjanitor DataFrame methods (e.g. transform_column)
import matplotlib.pyplot as plt
import seaborn as sns
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.models import HoverTool
# from holoviews.operation.datashader import datashade, dynspread, rasterize
# import datashader as ds


# tooltips = [
#     ('Text', '@text'),
#     ('Flavor', '@flavor_text'),
#     ('Color ID','@color_identity'),
#     ('Cluster', '@cluster')
# ]
# Map each single-letter color identity to a plot color; 'M' marks multicolor.
color_mapping = {'W': 'tan', 'U': 'blue', 'B': 'purple', 'R': 'red', 'G': 'green', 'M': 'goldenrod'}
# HoloViews dim transform: anything not in color_mapping falls back to grey.
mana_color_dim = hv.dim('color_identity').categorize(color_mapping, default='grey')
# Shared figure dimensions reused by the full-width plots below.
fullwidth=dict(height=450, width=900)
# hover = HoverTool(tooltips=tooltips)
# Default styling: hover tooltips and point size for all Scatter/Points elements.
opts.defaults(opts.Scatter(tools=['hover'], size=8, **fullwidth), 
              opts.Points(tools=['hover'], size=8, **fullwidth))
%%coconut 
from tlp.data import DataLoader

# Load the MTG card dataset and normalize the color-identity column:
#  - drop cards missing rules text or flavor text
#  - collapse each card's color_identity to its unique letters
#  - relabel any multi-letter identity as 'M' (multicolor)
df = (
    DataLoader.mtg()
    .dropna(subset=['flavor_text', 'text'])
#     .fillna(value={'color_identity':'NA'})
    .transform_column('color_identity', s->''.join(set(s)),elementwise=True)
    # raw string: '\w' in a plain literal is an invalid escape (DeprecationWarning)
    .replace(to_replace={'color_identity':re.compile(r'\w{2,}')}, value='M')
)
# One document per card: rules text + card name + flavor text.
text = df.text.str.cat(others=[df.name, df.flavor_text], sep='\n')#.fillna('')
df.head()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File /tmp/ipykernel_103979/1684363309.py:28
     24 # Compiled Coconut: -----------------------------------------------------------
     26 from tlp.data import DataLoader  #1: from tlp.data import DataLoader
---> 28 df = (DataLoader.mtg().dropna(subset=['flavor_text', 'text']).transform_column('color_identity', lambda s: ''.join(set(s)), elementwise=True).replace(to_replace={'color_identity': re.compile('\w{2,}')}, value='M'))  #3: df = (
     29 text = df.text.str.cat(others=[df.name, df.flavor_text], sep='\n')  #.fillna('')  #10: text = df.text.str.cat(others=[df.name, df.flavor_text], sep='\n')#.fillna('')
     30 df.head()  #11: df.head()

File ~/Sync/code/tlp/text-as-data/tlp/data/mtg.py:147, in open_mtg()
    144 def open_mtg() -> pd.DataFrame:
    145     from dvc import api
--> 147     with api.open(Path(MTG_DATA_PATH)/'mtg.feather') as mtg_feather:   
    149         return pd.read_feather(mtg_feather)

File ~/.pyenv/versions/3.10.9/lib/python3.10/contextlib.py:135, in _GeneratorContextManager.__enter__(self)
    133 del self.args, self.kwds, self.func
    134 try:
--> 135     return next(self.gen)
    136 except StopIteration:
    137     raise RuntimeError("generator didn't yield") from None

File ~/.pyenv/versions/text-data/lib/python3.10/site-packages/dvc/api/data.py:262, in _open(path, repo, rev, remote, mode, encoding, config, remote_config)
    260 else:
    261     fs = DVCFileSystem(repo=_repo, subrepos=True)
--> 262     fs_path = fs.from_os_path(path)
    264 try:
    265     with fs.open(
    266         fs_path,
    267         mode=mode,
    268         encoding=encoding,
    269     ) as fobj:

File ~/.pyenv/versions/text-data/lib/python3.10/site-packages/dvc/fs/dvc.py:437, in DVCFileSystem.from_os_path(self, path)
    434 if os.path.isabs(path):
    435     path = os.path.relpath(path, self.repo.root_dir)
--> 437 return as_posix(path)

File ~/.pyenv/versions/text-data/lib/python3.10/site-packages/dvc/fs/dvc.py:30, in as_posix(path)
     29 def as_posix(path: str) -> str:
---> 30     return path.replace(ntpath.sep, posixpath.sep)

TypeError: Path.replace() takes 2 positional arguments but 3 were given
# from syntok.tokenizer import Tokenizer

# tok = Tokenizer()  # optional: keep "n't" contractions and "-", "_" inside words as tokens
# text.apply(list..(tok.tokenize))

import re
# Card-text tokenizer. Alternation matches, in priority order:
# hashtags, single-char slash/ampersand pairs (e.g. "w/u"), ordinary words
# of length >= 2, mana symbols like {W}, and P/T modifiers like +1/+1.
tokenize = re.compile(
    r'(?:\#[\w\d]+\b)'
    r'|(?:\b\w[\/\&]\w)\b'
    r'|(?:\b\w[\w\'\d]+)\b'
    r'|(?:\{\w\})'  # mana
    r'|(?:[+-]\d\d?(?:/[+-]\d\d?)?)'  # tokens
)

# Preview the first 100 unique tokens across all card documents.
# NOTE(review): the NameError traceback below shows `text` was undefined in
# this kernel session — the data-loading cell must run successfully first.
text.str.findall(tokenize).explode().unique()[:100]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 15
      6 import re
      7 tokenize = re.compile(
      8     r'(?:\#[\w\d]+\b)'
      9     r'|(?:\b\w[\/\&]\w)\b'
   (...)
     12     r'|(?:[+-]\d\d?(?:/[+-]\d\d?)?)'  # tokens
     13 )
---> 15 text.str.findall(tokenize).explode().unique()[:100]

NameError: name 'text' is not defined

Visualize#

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
import umap#, umap.plot
from sklearn.decomposition import TruncatedSVD

# TF-IDF model over the card documents, using the compiled regex as tokenizer.
vsm = TfidfVectorizer(
    tokenizer=tokenize.findall,
    min_df=3, 
    max_df=0.8,
    stop_words='english',
    ngram_range=(1,2),
)  # vector-space model
# Reduce sparse TF-IDF to 100 dims with truncated SVD, then embed to 2-D
# with t-SNE for plotting (t-SNE does not accept sparse input directly).
manifold = Pipeline([
    ('pca',TruncatedSVD(n_components=100)),
    ('tsne', TSNE(learning_rate='auto', init='random'))
])
# manifold = umap.UMAP(n_components=2, metric='cosine', n_neighbors=15)

# Boolean mask selecting roughly 20% of the documents (seeded rng above).
sample = rng.choice([True, False], size=len(text), p=[0.2, 0.8])
X = vsm.fit_transform(text)
# Only the sampled rows are embedded to keep t-SNE tractable.
X_2d = manifold.fit_transform(X[sample,:])

X_2d.shape
# df_samp = 

# HoloViews Table of the sampled cards plus their 2-D embedding coordinates;
# x/y are key dimensions, card text fields are value dimensions (for hover).
key_dimensions = [('x', 'Dim-1'), ('y', 'Dim-2')]
value_dimensions = [
    ('text', 'Rules Text'), 
    ('flavor_text', 'Flavor Text'), 
    ('color_identity', 'Color ID'),
]
macro = hv.Table(df[sample].assign(x=X_2d[:,0], y=X_2d[:,1]), key_dimensions, value_dimensions)
macro.to.points(['x','y'])
%%coconut
# Scatter of the t-SNE embedding: one Points element per color identity,
# overlaid and colored with the mana color mapping defined earlier.
scatter = (macro.to.points(['x','y'], groupby='color_identity')
 .overlay()
#  .opts(**fullwidth)
#  |> datashade$(aggregator=ds.by('color_identity', ds.count()))
#  |> dynspread
 |> .opts(opts.Points(color=mana_color_dim, tools=['hover', 'lasso_select'], **fullwidth))
)
scatter
# Same data as a HoloMap (one frame per color identity) instead of an overlay.
macro.to.points(['x','y'], groupby='color_identity').opts(opts.Points(width=600, color=mana_color_dim))

Keywords#

# Top 40 terms by total TF-IDF mass summed over all documents.
pd.Series(
    np.asarray(X.sum(axis=0))[0], 
    index=vsm.get_feature_names_out()
).sort_values(ascending=False).head(40)

Clustering#

%%coconut
import hdbscan
# Cluster in the original high-dimensional TF-IDF space.
# NOTE(review): hdbscan's tree-based algorithms do not support
# metric='cosine' — confirm this cell actually ran without error.
clust = hdbscan.HDBSCAN(
#     min_samples=1,
#     min_cluster_size=30,
    metric='cosine',
)
highD_labels = clust.fit_predict(X[sample,:]) |> pd.Series
clust.condensed_tree_.plot()
highD_labels.value_counts()
# scatt2 = hv.Scatter(df_samp.assign(cluster=highD_labels.values),'x', 
#            vdims=['y', 'text', 'flavor_text', 'cluster']).opts(color='cluster', cmap='glasbey', size=5)
# scatt2
# Color the 2-D embedding by the high-dimensional cluster labels.
macro.add_dimension('cluster', 0, highD_labels.values).to.points(['x','y'], groupby='cluster').overlay()
## from docs: https://umap-learn.readthedocs.io/en/latest/clustering.html
# clusterable_embedding = umap.UMAP(
#     n_neighbors=40,
#     min_dist=0.1,
#     n_components=2,
#     random_state=42,
# ).fit_transform(vsm.transform(text[sample]))
%%coconut
# Cluster directly on the 2-D t-SNE coordinates instead of the sparse space.
clust =hdbscan.HDBSCAN(
#     min_samples=10,
    min_cluster_size=10,
    cluster_selection_epsilon=2,
)
labels=clust.fit_predict(X_2d)|> pd.Series #|> .replace(-1, None)
clust.condensed_tree_.plot()
labels.value_counts()
# hdbscan.all_points_membership_vectors(clust)

# Overlay the embedding colored by the 2-D cluster labels (-1 = noise).
macro.add_dimension('cluster', 0, labels.values).to.points(['x','y'], groupby='cluster').overlay()

Cheating?#

%%coconut
from sklearn.preprocessing import LabelEncoder

# Supervised ("cheating") embedding: UMAP partially uses the encoded color
# identity as a categorical target (weight 0.3) when laying out points.
class_encode = LabelEncoder()
targets = class_encode.fit_transform(df.loc[sample, 'color_identity'].values)
metric_learn = umap.UMAP(target_metric='categorical', target_weight=0.3)

M_2d = (
    text[sample] 
    |> vsm.transform 
    |> metric_learn.fit_transform$(y=targets)
)
# Attach the supervised coordinates as extra dimensions on the table.
new_macro = (
    macro
    .add_dimension('x-class',0, M_2d[:,0])
    .add_dimension('y-class',1, M_2d[:,1])
)
# NOTE(review): opts.Scatter styles Scatter elements, but this overlay
# contains Points — the cmap option likely has no effect here; verify.
(new_macro.to.points(['x-class','y-class'], groupby='color_identity')
 .overlay().opts(opts.Scatter(cmap=color_mapping))
#  |> datashade$(aggregator=ds.by('color_identity', ds.count()))
#  |> dynspread
#  |> .opts(**fullwidth)
)

Topic Modeling#

Latent Semantic Indexing (i.e. SVD)#

def plot_top_words(model, feature_names, n_top_words, title):
    """Bar-chart the top-weighted terms of each topic in a fitted model.

    Parameters
    ----------
    model : fitted decomposition estimator exposing ``components_`` with
        shape (n_topics, n_features) — e.g. TruncatedSVD, NMF, LDA.
    feature_names : sequence of str
        Maps feature index -> term (e.g. ``vsm.get_feature_names_out()``).
    n_top_words : int
        Number of terms shown per topic panel.
    title : str
        Figure suptitle.
    """
    n_topics = len(model.components_)
    # Size the grid to the model instead of a hard-coded 2x5: the original
    # raised IndexError for models with more than 10 components (e.g. the
    # LatentDirichletAllocation(n_components=30) cell below).
    n_cols = 5
    n_rows = (n_topics + n_cols - 1) // n_cols  # ceil division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 7.5 * n_rows), sharex=True)
    axes = np.atleast_1d(axes).flatten()
    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, in descending order.
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        ax.invert_yaxis()  # largest weight on top
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)
    # Hide any unused panels in the final row.
    for ax in axes[n_topics:]:
        ax.set_visible(False)
    # Hoisted out of the loop — it was being re-set once per topic.
    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

    
# Fit a 10-component LSI model (truncated SVD) and plot each topic's top terms.
plot_top_words(
    TruncatedSVD(n_components=10).fit(X), vsm.get_feature_names_out(), 10, "Topics (Frobenius norm)"
)

Non-Negative Matrix Factorization (NMF)#

from sklearn.decomposition import NMF, LatentDirichletAllocation

# Factor the TF-IDF matrix into 10 non-negative topics.
# .fit() suffices: the document-topic matrix returned by fit_transform
# was being discarded.
topics = NMF(n_components=10)
topics.fit(X)
plot_top_words(
    topics, vsm.get_feature_names_out(), 10, "Topics (Frobenius norm)"
)

Latent Dirichlet Allocation#

# Fit a 30-topic LDA model on the term-document matrix.
# .fit() suffices: the document-topic matrix returned by fit_transform
# was being discarded.
topics = LatentDirichletAllocation(n_components=30)
topics.fit(X)

# NOTE(review): 30 topics needs a plot grid larger than 2x5 — confirm
# plot_top_words can lay out all panels before running this cell.
plot_top_words(
    topics, vsm.get_feature_names_out(), 10, "Topics (Frobenius norm)"
)