# models/visualize_utils.py
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from bokeh.io import export_png, output_notebook, show
from bokeh.plotting import figure
from bokeh.models import Plot, Range1d, MultiLine, Circle, HoverTool, TapTool, BoxSelectTool, LinearColorMapper, ColumnDataSource, LabelSet, SaveTool, ColorBar, BasicTicker
from bokeh.models.graphs import from_networkx, NodesAndLinkedEdges, EdgesAndLinkedNodes
from bokeh.palettes import Spectral8
def visualize_sentences(vecs, sentences, palette="Viridis256", filename="/notebooks/embedding/sentences.png",
                        use_notebook=False):
    """Project sentence vectors to 2D with t-SNE and draw a labeled scatter plot.

    Each point is labeled with its sentence and colored by its y coordinate.
    Shows the plot inline when `use_notebook` is True, otherwise exports a
    PNG to `filename`.
    """
    # reduce sentence vectors to two dimensions for plotting
    points = TSNE(n_components=2).fit_transform(vecs)
    xs, ys = points[:, 0], points[:, 1]
    df = pd.DataFrame({'x': xs, 'y': ys, 'sentence': sentences}, columns=['x', 'y', 'sentence'])
    source = ColumnDataSource(ColumnDataSource.from_df(df))
    labels = LabelSet(x="x", y="y", text="sentence", y_offset=8,
                      text_font_size="12pt", text_color="#555555",
                      source=source, text_align='center')
    # map each point's y coordinate onto the palette
    mapper = LinearColorMapper(palette=palette, low=min(ys), high=max(ys))
    plot = figure(plot_width=900, plot_height=900)
    plot.scatter("x", "y", size=12, source=source,
                 color={'field': 'y', 'transform': mapper},
                 line_color=None, fill_alpha=0.8)
    plot.add_layout(labels)
    if use_notebook:
        output_notebook()
        show(plot)
    else:
        export_png(plot, filename)
        print("save @ " + filename)
"""
Visualize homonyms (2d vector space)
Inspired by:
https://github.com/hengluchang/visualizing_contextual_vectors/blob/master/elmo_vis.py
"""
def visualize_homonym(homonym, tokenized_sentences, vecs, model_name, palette="Viridis256",
                      filename="/notebooks/embedding/homonym.png", use_notebook=False):
    """Plot 2D t-SNE positions of each occurrence of `homonym`, labeled by sentence.

    Each plotted point is annotated with its (display-processed) source sentence
    so the contextual senses of the homonym can be compared visually.

    Args:
        homonym: surface token to highlight (matched by exact equality).
        tokenized_sentences: list of token lists, one per sentence.
        vecs: contextual vectors; row 0 is skipped below — presumably a special
            leading vector (e.g. [CLS]) — TODO confirm against the caller.
        model_name: "bert" strips "##" subword markers when rebuilding sentences.
        palette, filename, use_notebook: plotting/output options.
    """
    # process sentences
    token_list, processed_sentences = [], []
    for tokens in tokenized_sentences:
        token_list.extend(tokens)
        sentence = []
        for token in tokens:
            if model_name == "bert":
                # drop WordPiece continuation markers for display
                processed_token = token.replace("##", "")
            else:
                processed_token = token
            if token == homonym:
                # quote the homonym so it stands out in the label
                processed_token = "\"" + processed_token + "\""
            sentence.append(processed_token)
        processed_sentences.append(' '.join(sentence))
    # dimension reduction
    tsne = TSNE(n_components=2)
    # NOTE(review): vecs[1:] drops the first vector while token_list keeps every
    # token, so zip() below pairs token i with the vector of token i+1 unless
    # vecs has exactly one extra leading row — verify this alignment.
    tsne_results = tsne.fit_transform(vecs[1:])
    # only plot the word representation of interest
    # NOTE(review): assumes the homonym occurs at most len(tokenized_sentences)
    # times in total; more occurrences would raise an IndexError below, and
    # fewer leave trailing all-zero rows that get plotted at the origin.
    interest_vecs, idx = np.zeros((len(tokenized_sentences), 2)), 0
    for word, vec in zip(token_list, tsne_results):
        if word == homonym:
            interest_vecs[idx] = vec
            idx += 1
    df = pd.DataFrame(columns=['x', 'y', 'annotation'])
    df['x'], df['y'], df['annotation'] = interest_vecs[:, 0], interest_vecs[:, 1], processed_sentences
    source = ColumnDataSource(ColumnDataSource.from_df(df))
    labels = LabelSet(x="x", y="y", text="annotation", y_offset=8,
                      text_font_size="12pt", text_color="#555555",
                      source=source, text_align='center')
    # color range is taken from ALL tokens' y coordinates, not only the
    # plotted homonym subset
    color_mapper = LinearColorMapper(palette=palette, low=min(tsne_results[:, 1]), high=max(tsne_results[:, 1]))
    plot = figure(plot_width=900, plot_height=900)
    plot.scatter("x", "y", size=12, source=source, color={'field': 'y', 'transform': color_mapper},
                 line_color=None,
                 fill_alpha=0.8)
    plot.add_layout(labels)
    if use_notebook:
        output_notebook()
        show(plot)
    else:
        export_png(plot, filename)
        print("save @ " + filename)
def visualize_between_sentences(sentences, vec_list, palette="Viridis256",
                                filename="/notebooks/embedding/between-sentences.png",
                                use_notebook=False):
    """Render a pairwise cosine-similarity heatmap between sentences.

    Args:
        sentences: list of sentence strings (used as both axis labels).
        vec_list: one vector per sentence, aligned with `sentences`.
        palette: bokeh palette name for the color scale.
        filename: PNG output path when not in notebook mode.
        use_notebook: if True, show inline instead of exporting a PNG.
    """
    df_list, score_list = [], []
    for sent1_idx, sentence1 in enumerate(sentences):
        for sent2_idx, sentence2 in enumerate(sentences):
            vec1, vec2 = vec_list[sent1_idx], vec_list[sent2_idx]
            # all-zero vectors have no direction; skip them to avoid a
            # meaningless cosine similarity
            if np.any(vec1) and np.any(vec2):
                score = cosine_similarity(X=[vec1], Y=[vec2])
                df_list.append({'x': sentence1, 'y': sentence2, 'similarity': score[0][0]})
                score_list.append(score[0][0])
    df = pd.DataFrame(df_list)
    # low > high is deliberate: it reverses the palette so the highest
    # similarity maps to the first palette color
    color_mapper = LinearColorMapper(palette=palette, low=np.max(score_list), high=np.min(score_list))
    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    p = figure(x_range=sentences, y_range=list(reversed(sentences)),
               x_axis_location="above", plot_width=900, plot_height=900,
               toolbar_location='below', tools=TOOLS,
               tooltips=[('sentences', '@x @y'), ('similarity', '@similarity')])
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 0
    # fix: tilt x labels by exactly pi/3 instead of the 3.14 approximation
    p.xaxis.major_label_orientation = np.pi / 3
    p.rect(x="x", y="y", width=1, height=1,
           source=df,
           fill_color={'field': 'similarity', 'transform': color_mapper},
           line_color=None)
    color_bar = ColorBar(ticker=BasicTicker(desired_num_ticks=5),
                         color_mapper=color_mapper, major_label_text_font_size="7pt",
                         label_standoff=6, border_line_color=None, location=(0, 0))
    p.add_layout(color_bar, 'right')
    if use_notebook:
        output_notebook()
        show(p)
    else:
        export_png(p, filename)
        print("save @ " + filename)
def visualize_self_attention_scores(tokens, scores, filename="/notebooks/embedding/self-attention.png",
                                    use_notebook=False):
    """Draw token self-attention as a circular graph; edge width follows weight.

    Self-loops and edges strictly below the mean attention score are zeroed,
    then every weight is rescaled against the mean/max before rendering.

    Args:
        tokens: token labels; at most 8 get distinct colors (Spectral8).
        scores: square attention matrix, scores[i][j] = attention of i on j.
        filename, use_notebook: output options.
    """
    mean_prob = np.mean(scores)
    weighted_edges = []
    for idx_1, token_prob_dist_1 in enumerate(scores):
        for idx_2, el in enumerate(token_prob_dist_1):
            # suppress self-attention and below-average edges
            if idx_1 == idx_2 or el < mean_prob:
                weighted_edges.append((tokens[idx_1], tokens[idx_2], 0))
            else:
                weighted_edges.append((tokens[idx_1], tokens[idx_2], el))
    max_prob = np.max([el[2] for el in weighted_edges])
    # NOTE(review): this rescaling is applied to the zeroed edges too, giving
    # them weight -mean/(max-mean) < 0 and hence a negative line_width below —
    # verify that this is the intended rendering for suppressed edges.
    weighted_edges = [(el[0], el[1], (el[2] - mean_prob) / (max_prob - mean_prob)) for el in weighted_edges]
    G = nx.Graph()
    G.add_nodes_from([el for el in tokens])
    G.add_weighted_edges_from(weighted_edges)
    plot = Plot(plot_width=500, plot_height=500,
                x_range=Range1d(-1.1, 1.1), y_range=Range1d(-1.1, 1.1))
    plot.add_tools(HoverTool(tooltips=None), TapTool(), BoxSelectTool())
    # lay the tokens out on a circle
    graph_renderer = from_networkx(G, nx.circular_layout, scale=1, center=(0, 0))
    # one distinct color per token
    graph_renderer.node_renderer.data_source.data['colors'] = Spectral8[:len(tokens)]
    graph_renderer.node_renderer.glyph = Circle(size=15, line_color=None, fill_color="colors")
    graph_renderer.node_renderer.selection_glyph = Circle(size=15, fill_color="colors")
    graph_renderer.node_renderer.hover_glyph = Circle(size=15, fill_color="grey")
    # edge width proportional to the rescaled attention weight
    graph_renderer.edge_renderer.data_source.data["line_width"] = [G.get_edge_data(a, b)['weight'] * 3 for a, b in
                                                                   G.edges()]
    graph_renderer.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_width={'field': 'line_width'})
    graph_renderer.edge_renderer.selection_glyph = MultiLine(line_color="grey", line_width=5)
    graph_renderer.edge_renderer.hover_glyph = MultiLine(line_color="grey", line_width=5)
    # selecting a node highlights its edges; hovering an edge highlights its nodes
    graph_renderer.selection_policy = NodesAndLinkedEdges()
    graph_renderer.inspection_policy = EdgesAndLinkedNodes()
    plot.renderers.append(graph_renderer)
    # place token labels at the laid-out node positions
    x, y = zip(*graph_renderer.layout_provider.graph_layout.values())
    data = {'x': list(x), 'y': list(y), 'connectionNames': tokens}
    source = ColumnDataSource(data)
    labels = LabelSet(x='x', y='y', text='connectionNames', source=source, text_align='center')
    plot.renderers.append(labels)
    plot.add_tools(SaveTool())
    if use_notebook:
        output_notebook()
        show(plot)
    else:
        export_png(plot, filename)
        print("save @ " + filename)
def visualize_words(words, vecs, palette="Viridis256", filename="/notebooks/embedding/words.png",
                    use_notebook=False):
    """Scatter-plot word vectors in 2D via t-SNE, labeling each point with its word.

    Shows inline when `use_notebook` is True, otherwise exports a PNG to
    `filename`.
    """
    # project the word vectors down to two dimensions
    projected = TSNE(n_components=2).fit_transform(vecs)
    xs, ys = projected[:, 0], projected[:, 1]
    df = pd.DataFrame({'x': xs, 'y': ys, 'word': list(words)}, columns=['x', 'y', 'word'])
    source = ColumnDataSource(ColumnDataSource.from_df(df))
    labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                      text_font_size="15pt", text_color="#555555",
                      source=source, text_align='center')
    # color each point by its y coordinate
    mapper = LinearColorMapper(palette=palette, low=min(ys), high=max(ys))
    plot = figure(plot_width=900, plot_height=900)
    plot.scatter("x", "y", size=12, source=source,
                 color={'field': 'y', 'transform': mapper},
                 line_color=None, fill_alpha=0.8)
    plot.add_layout(labels)
    if use_notebook:
        output_notebook()
        show(plot)
    else:
        export_png(plot, filename)
        print("save @ " + filename)
def visualize_between_words(words, vecs, palette="Viridis256", filename="/notebooks/embedding/between-words.png",
                            use_notebook=False):
    """Render a pairwise cosine-similarity heatmap for a list of words.

    Args:
        words: iterable of word strings (used as both axis labels).
        vecs: one vector per word, aligned with `words`.
        palette: bokeh palette name for the color scale.
        filename: PNG output path when not in notebook mode.
        use_notebook: if True, show inline instead of exporting a PNG.
    """
    df_list = []
    for word1_idx, word1 in enumerate(words):
        for word2_idx, word2 in enumerate(words):
            vec1 = vecs[word1_idx]
            vec2 = vecs[word2_idx]
            # all-zero vectors have no direction; skip them to avoid a
            # meaningless cosine similarity
            if np.any(vec1) and np.any(vec2):
                score = cosine_similarity(X=[vec1], Y=[vec2])
                df_list.append({'x': word1, 'y': word2, 'similarity': score[0][0]})
    df = pd.DataFrame(df_list)
    # low=1 / high=0 deliberately reverses the palette over the full
    # similarity range so high similarity gets the first palette color
    color_mapper = LinearColorMapper(palette=palette, low=1, high=0)
    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    p = figure(x_range=list(words), y_range=list(reversed(list(words))),
               x_axis_location="above", plot_width=900, plot_height=900,
               toolbar_location='below', tools=TOOLS,
               tooltips=[('words', '@x @y'), ('similarity', '@similarity')])
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 0
    # fix: tilt x labels by exactly pi/3 instead of the 3.14 approximation
    p.xaxis.major_label_orientation = np.pi / 3
    p.rect(x="x", y="y", width=1, height=1,
           source=df,
           fill_color={'field': 'similarity', 'transform': color_mapper},
           line_color=None)
    color_bar = ColorBar(ticker=BasicTicker(desired_num_ticks=5),
                         color_mapper=color_mapper, major_label_text_font_size="7pt",
                         label_standoff=6, border_line_color=None, location=(0, 0))
    p.add_layout(color_bar, 'right')
    if use_notebook:
        output_notebook()
        show(p)
    else:
        export_png(p, filename)
        print("save @ " + filename)