-
Notifications
You must be signed in to change notification settings - Fork 0
/
embed_example.py
114 lines (92 loc) · 3.26 KB
/
embed_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os
import numpy as np
import pandas as pd
import plotly.express as px
import requests
from loguru import logger
from sklearn.manifold import TSNE
# API secret sent with every request; the value is None when the
# TEXT_GENERATOR_SECRET environment variable is not set.
headers = {"secret": os.getenv("TEXT_GENERATOR_SECRET")}

# Inputs to embed: code snippets, natural-language prompts, plain
# descriptions, and two image URLs.
texts_to_embed = [
    "def factorial(n):\n\tif n == 0:\n \treturn 1\n\treturn factorial(n - 1) * n\n",
    "write a function to return factorial of a number",
    "write a function to print a number twice",
    "def print_twice(x):\n\tprint(x)\n\tprint(x)\n",
    "electrical testing of a switchboard with hand holding a red wire",
    "cat and dog laying on the floor",
    "https://images2.minutemediacdn.com/image/upload/c_fill,w_1080,ar_16:9,f_auto,q_auto,g_auto/shape%2Fcover%2Fsport%2F516438-istock-637689912-981f23c58238ea01a6147d11f4c81765.jpg",
    "https://static.text-generator.io/static/img/Screenshot%20from%202022-09-12%2010-08-50.png",
]

# Short display labels, one per entry of texts_to_embed (same order).
labels_for_graph = [
    "factorial code",
    "factorial prompt",
    "printing prompt",
    "printing code",
    "electrical description",
    "cat and dog",
    "image of cat and dog",
    "image of electrical",
]
# Request one embedding per input from the feature-extraction endpoint and
# collect the parsed JSON responses in input order.
embeddings = []
for text in texts_to_embed:
    response = requests.post(
        "https://api.text-generator.io/api/v1/feature-extraction",
        json={"text": text, "num_features": 230},
        headers=headers,
        # Without a timeout a stalled connection would hang the script forever.
        timeout=60,
    )
    # Fail fast on HTTP errors (e.g. a missing/invalid secret) instead of
    # appending an error payload as if it were an embedding.
    response.raise_for_status()
    embeddings.append(response.json())  # the embedding is a list of numbers
logger.info(embeddings)
# Reduce the embeddings to three t-SNE components; the first two are plotted
# below in 2D and all three are reused by the later 3D plot.
# PCA could be used for the reduction instead, e.g.:
# from sklearn.decomposition import PCA
# two_dim = PCA(random_state=0).fit_transform(np.array(embeddings))[:,:2]

# perplexity must be strictly positive and smaller than the number of samples
# (a value of 0 is rejected by scikit-learn's parameter validation), so derive
# a small valid value from the sample count.
tsne_perplexity = min(5, len(embeddings) - 1)
# NOTE(review): newer scikit-learn releases rename n_iter to max_iter -
# confirm against the installed version.
small_embed = TSNE(
    n_components=3,
    random_state=0,
    perplexity=tsne_perplexity,
    learning_rate="auto",
    n_iter=250,
).fit_transform(
    np.array(embeddings)
)  # takes .15s for 250k features .03s for 2.5k

# One row per embedded input; hovering a point shows the original input text.
df = pd.DataFrame(
    data={
        "x": [point[0] for point in small_embed],
        "y": [point[1] for point in small_embed],
        "hover_data": texts_to_embed,
    }
)
# 2d plot
fig = px.scatter(df, x="x", y="y", hover_data=["hover_data"])
fig.show()
fig.write_html("embed_example2.html")
import numpy as np
from datetime import datetime
from datetime import timedelta
# could also choose to embed here using PCA
from sklearn.decomposition import PCA
# NOTE(review): nothing executes between the two timestamps (the PCA call is
# commented out), so the "TSNE time taken" printed below is effectively zero
# and does not measure the t-SNE fit - confirm what was meant to be timed.
start_time = datetime.now()
# small_embed = PCA(random_state=0).fit_transform(np.array(embeddings))[:,:2]
end_time = datetime.now()
elapsed = end_time - start_time
print(f"TSNE time taken {elapsed}")
print(small_embed)

# 3d plot: one column per reduced component, labelled with the short names.
coords_3d = {
    axis: [point[idx] for point in small_embed]
    for idx, axis in enumerate("xyz")
}
coords_3d["hover_data"] = labels_for_graph
df = pd.DataFrame(data=coords_3d)

fig = px.scatter_3d(df, x="x", y="y", z="z", hover_data=["hover_data"])
fig.show()
# include_plotlyjs=False leaves plotly.js out of the written file, so the page
# that embeds this HTML has to load plotly.js itself.
fig.write_html("embed_example8.html", include_plotlyjs=False)
from scipy.spatial import distance_matrix
from scipy.spatial.distance import euclidean
def m_euclid(v1, v2):
    """Return the Euclidean distance between vectors *v1* and *v2*.

    Thin wrapper that delegates to ``scipy.spatial.distance.euclidean``.
    """
    separation = euclidean(v1, v2)
    return separation
# Pairwise Euclidean distances between every pair of embeddings:
# row i / column j holds the distance from embedding i to embedding j.
dist_list = [
    [m_euclid(row_vec, col_vec) for col_vec in embeddings]
    for row_vec in embeddings
]
dist_matrix = pd.DataFrame(dist_list, columns=labels_for_graph)
print(dist_matrix)

# Heatmap of the distance matrix, with the short labels on the y axis as well.
fig2 = px.imshow(dist_matrix, y=labels_for_graph)
fig2.show()