SVM.py
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from gensim import corpora, models
from nltk.tokenize import word_tokenize
import numpy as np
import random
import os
import nltk
nltk.download('punkt')
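
# Pipeline overview: sample paragraphs from a directory of novels, represent
# each paragraph as an LDA topic distribution, then train a linear SVM to
# predict which novel a paragraph came from.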
# Extract sample paragraphs from every novel in the corpus
def extract_paragraphs(corpus_dir, num_paragraphs, max_tokens, charwords):
    paragraphs = []
    labels = []
    for novel_file in os.listdir(corpus_dir):
        if novel_file.endswith('.txt'):
            novel_path = os.path.join(corpus_dir, novel_file)
            # Read the novel's text
            with open(novel_path, 'r', encoding='gbk', errors='ignore') as file:
                novel_text = file.read()
            novel_paragraphs = novel_text.split('\n')
            # Randomly sample a number of paragraphs
            random.shuffle(novel_paragraphs)
            for paragraph in novel_paragraphs:
                if not paragraph.strip():
                    continue  # skip empty lines
                if charwords == "words":
                    words = word_tokenize(paragraph)
                else:
                    # "char" mode: treat each character as a token
                    words = list(paragraph)
                if len(words) <= max_tokens:
                    paragraphs.append(words)
                    labels.append(novel_file[:-4])  # novel file name as the label
                if len(paragraphs) >= num_paragraphs:
                    return paragraphs, labels
    return paragraphs, labels
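
# Tokenization behaviour (sketch; the character branch assumes one token per
# character):
#   charwords == "words": word_tokenize("I like tea") -> ['I', 'like', 'tea']
#   charwords == "char":  list("我喜欢茶") -> ['我', '喜', '欢', '茶']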
# Train an LDA topic model on the tokenized paragraphs
def train_lda_model(paragraphs, num_topics):
    # Build the dictionary and bag-of-words corpus
    dictionary = corpora.Dictionary(paragraphs)
    corpus = [dictionary.doc2bow(words) for words in paragraphs]
    # Train the LDA model
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)
    return lda_model
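
# Usage sketch for inspecting the learned topics (print_topics is a standard
# gensim LdaModel method):
#   lda = train_lda_model(paragraphs, num_topics=20)
#   for topic_id, terms in lda.print_topics(num_words=8):
#       print(topic_id, terms)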
if __name__ == '__main__':
    # Corpus directory
    corpus_dir = r'D:\dpwork\data'
    # Number of LDA topics
    num_topics = 20
    # Sampling parameters
    num_paragraphs = 1000
    max_tokens = [20, 100, 500, 1000, 3000]
    modes = ["char", "words"]
    charwords = modes[1]
    if charwords == "char":
        print("Character-level analysis:")
    else:
        print("Word-level analysis:")
    # Evaluate each maximum paragraph length
    for k in max_tokens:
        print(f"Max tokens: {k}")
        paragraphs, labels = extract_paragraphs(corpus_dir, num_paragraphs, k, charwords)
        # Label set returned by extract_paragraphs
        all_labels_set = set(labels)
        lda_model = train_lda_model(paragraphs, num_topics=num_topics)
        # Represent each paragraph as a topic-distribution vector
        topics_distribution = np.zeros((len(paragraphs), num_topics))
        for i, words in enumerate(paragraphs):
            bow_vector = lda_model.id2word.doc2bow(words)
            topics = lda_model[bow_vector]
            for topic_id, prob in topics:
                topics_distribution[i, topic_id] = prob
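        # Note: gensim's lda_model[bow] omits topics whose probability is below
        # the model's minimum_probability threshold (0.01 by default), so any
        # topic it does not return simply keeps its initial zero in the matrix.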
        # Split into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            topics_distribution, labels, test_size=0.2, random_state=42)
        # Build the SVM classifier
        model = SVC(kernel='linear')
        # Train
        model.fit(X_train, y_train)
        # Training accuracy
        train_accuracy = model.score(X_train, y_train)
        print("Train accuracy:", train_accuracy)
        # Test accuracy
        test_accuracy = model.score(X_test, y_test)
        print("Test accuracy:", test_accuracy)