-
Notifications
You must be signed in to change notification settings - Fork 0
/
excel_mapping.py
56 lines (40 loc) · 1.96 KB
/
excel_mapping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
### An algorithm to find similar columns from two csv files, with the help from the offline interaction through chatGPT prompt engineering
### This works well with the sample data test, though simple enough and without involving LLM in the code
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
df1 = pd.read_csv('template.csv')
df2 = pd.read_csv('table_A.csv') # table_B.csv
def compute_similar_columns(df1, df2, threshold):
similar_columns = []
# for efficiency, only use top 1000 rows
df1 = df1.head(100)
df2 = df2.head(100)
for column1 in df1.columns:
values1 = df1[column1].tolist()
# add column name into the value list
values1.insert(0, column1)
for column2 in df2.columns:
values2 = df2[column2].tolist()
values2.insert(0, column2)
# Compute cosine similarity
similarity = compute_cosine_similarity(values1, values2)
if similarity > threshold:
similar_columns.append((column1, column2, similarity))
return similar_columns
def compute_cosine_similarity(list1, list2):
# Convert the input lists to strings
string1 = ' '.join(map(str, list1))
string2 = ' '.join(map(str, list2))
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(analyzer="char")
# Compute the TF-IDF matrix for the two strings
tfidf_matrix = vectorizer.fit_transform([string1, string2])
# Compute the cosine similarity between the two TF-IDF vectors
similarity_matrix = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
# Extract the similarity score from the matrix
similarity_score = similarity_matrix[0][0]
return similarity_score
similar_columns = compute_similar_columns(df1, df2, 0.8)
for column1, column2, similarity in similar_columns:
print(f"Similar columns: {column1} (File 1) and {column2} (File 2), Similarity: {similarity}")