-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnoun_proc.py
90 lines (77 loc) · 2.15 KB
/
noun_proc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from konlpy.tag import Okt
okt = Okt()
# from konlpy.tag import Mecab
# mecab = Mecab()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
# for NAVER articles
def article_proc(article):
    """Strip the trailing "▶"-delimited sections from an article body.

    The text is split on the "▶" marker. If no marker is present the text
    is returned unchanged. Otherwise the last three segments are dropped and
    the remaining segments are concatenated (the markers themselves are not
    re-inserted).

    Note: when the text contains only one or two markers there are at most
    three segments, so nothing survives and an empty string is returned.

    Args:
        article: Article text.

    Returns:
        The article text without its last three "▶"-delimited segments.
    """
    segments = article.split("▶")
    if len(segments) <= 1:
        # No marker at all: nothing to strip.
        return article
    # str.join builds the result in one pass instead of repeated `+=`.
    return "".join(segments[:-3])
# okt
def get_morphs(text):
    """Return the token strings that the Okt tagger extracts from *text*.

    The module-level ``okt`` tagger is called with normalization and
    stemming enabled. Only the first element of each returned item is kept
    (per the KoNLPy API each item is a ``(token, tag)`` pair, so the tag is
    discarded).

    Args:
        text: Text to analyze.

    Returns:
        A list with the first element of every item returned by ``okt.pos``,
        in the order the tagger produced them.
    """
    # Pass real booleans: a string flag is always truthy (even 'False'),
    # so string values cannot express "off" and hide the intent.
    tagged = okt.pos(text, norm=True, stem=True)
    return [pair[0] for pair in tagged]
# #mecab
# def get_morphs(text):
# tags = mecab.pos(text)
# l = []
# for words in tags:
# l.append(words[0])
# return l
def nonset_jac_sim(article, text):
    """Return a Jaccard-like similarity that counts repeated tokens.

    Both inputs are tokenized with ``get_morphs``. The numerator counts every
    token occurrence in *text* that also occurs in *article* (a token that
    appears several times in *text* is counted each time). The denominator is
    the number of distinct tokens across both inputs. Because of the repeated
    counting, the result is not bounded by 1.

    Args:
        article: Article text.
        text: Text (e.g. a comment) to compare against the article.

    Returns:
        The ratio described above, or 0.0 when neither input yields any
        token (avoids a division by zero).
    """
    article_tokens = get_morphs(article)
    comment_tokens = get_morphs(text)

    # Build the lookup set once so each membership test is O(1).
    article_vocab = set(article_tokens)
    shared_count = sum(1 for token in comment_tokens if token in article_vocab)

    distinct_tokens = article_vocab.union(comment_tokens)
    if not distinct_tokens:
        return 0.0
    return shared_count / len(distinct_tokens)
def tfid_vectorize(article, comment):
    """Fit a new TF-IDF vectorizer on the two texts and return the result.

    Args:
        article: First document.
        comment: Second document.

    Returns:
        The matrix produced by ``TfidfVectorizer().fit_transform`` for the
        pair ``(article, comment)``, in that order.
    """
    documents = (article, comment)
    # Vectorize both sentences in a single fit/transform pass.
    return TfidfVectorizer().fit_transform(documents)
def cos_sim(matrix):
    """Return the cosine similarity between row 0 and row 1 of *matrix*.

    Args:
        matrix: Two-dimensional matrix with at least two rows.

    Returns:
        The single value at position [0][0] of the pairwise result.
    """
    first_row = matrix[0:1]
    second_row = matrix[1:2]
    # Compare the first sentence with the second one.
    similarity = cosine_similarity(first_row, second_row)
    return similarity[0][0]
def l1_normalize(v):
    """Scale *v* so that the sum of its absolute values equals 1.

    The norm is taken over every entry of *v* as a whole (not per row), and
    all entries are divided by that single value. For input without negative
    entries this is the same as dividing by the plain sum.

    Args:
        v: Numeric array-like (for example a NumPy array or a sparse matrix).

    Returns:
        ``v`` divided by its L1 norm. If the norm is zero, ``v`` is returned
        unchanged rather than dividing by zero.
    """
    # The L1 norm is the sum of absolute values; a plain sum lets positive
    # and negative entries cancel. Built-in abs() keeps sparse input sparse;
    # plain sequences without __abs__ go through np.abs.
    magnitudes = abs(v) if hasattr(v, "__abs__") else np.abs(v)
    norm = np.sum(magnitudes)
    if norm == 0:
        return v
    return v / norm
def nor_euc_dis(matrix):
    """Return the Euclidean distance between rows 0 and 1 after normalizing.

    The whole matrix is first passed through ``l1_normalize``; the distance
    is then computed between the first and second rows of the result.

    Args:
        matrix: Two-dimensional matrix with at least two rows.

    Returns:
        The single value at position [0][0] of the pairwise distance result.
    """
    scaled = l1_normalize(matrix)
    first_row, second_row = scaled[0:1], scaled[1:2]
    return euclidean_distances(first_row, second_row)[0][0]
def list_calculate(l1, l2, k):
    """Combine two lists element-wise according to the mode *k*.

    Modes:
        0:  return ``l1`` itself, untouched (``l2`` is ignored).
        1:  element-wise sum ``l1 + l2``.
        -1: element-wise difference ``l1 - l2``.
        2:  element-wise geometric mean ``(l1 * l2) ** 0.5``.

    For the non-zero modes the lists are placed side by side in a pandas
    DataFrame, so positions missing from the shorter list become NaN in the
    result.

    Args:
        l1: First list of numbers.
        l2: Second list of numbers.
        k: Mode selector, one of 0, 1, -1, 2.

    Returns:
        ``l1`` for mode 0, otherwise a new list with the combined values.

    Raises:
        ValueError: If *k* is not one of the supported modes.
    """
    if k == 0:
        return l1
    if k not in (1, -1, 2):
        # Previously this fell through to an unbound local at `return`.
        raise ValueError(f"unsupported mode k={k!r}; expected 0, 1, -1 or 2")

    # One row per input list, transposed so each list becomes a column.
    frame = pd.DataFrame([l1, l2]).T
    left, right = frame[0], frame[1]

    if k == 1:
        combined = left + right
    elif k == -1:
        combined = left - right
    else:
        combined = (left * right) ** (1 / 2)
    return combined.tolist()