# rand_forest.py: random forest baseline for bot detection
# (forked from LuoUndergradXJTU/TwiBot-22)
import os
import json
import ijson
import argparse
import pandas as pd
import numpy as np
from feature_engineering import feature_preprocess
from feature_twibot22 import preprocess
from feature_supplement import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
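
# Command-line flag selecting which TwiBot benchmark to run on.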
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, default='Twibot-20', help='Choose the dataset.')
arg = parser.parse_args()
DATASET = arg.dataset
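
# Two loading paths: Twibot-22 is streamed with ijson (user.json and the nine
# tweet shards are large), while the smaller benchmarks are loaded whole with json.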
if DATASET == 'Twibot-22':
    label = pd.read_csv('../../datasets/Twibot-22/label.csv')
    split = pd.read_csv('../../datasets/Twibot-22/split.csv')
    user = list(ijson.items(open('../../datasets/Twibot-22/user.json', 'r'), 'item'))
    author = []
    tid = []
    tweet = []
    # tweets are sharded across tweet_0.json ... tweet_8.json
    for i in range(9):
        tweet = tweet + list(ijson.items(open('../../datasets/Twibot-22/tweet_' + str(i) + '.json', 'r'), 'item.text'))
        tid = tid + list(ijson.items(open('../../datasets/Twibot-22/tweet_' + str(i) + '.json', 'r'), 'item.id'))
        author = author + list(ijson.items(open('../../datasets/Twibot-22/tweet_' + str(i) + '.json', 'r'), 'item.author_id'))
    edge = pd.read_csv('../../datasets/Twibot-22/edge.csv')
    # map user ids to row indices and group tweet texts by author index
    id_tweet = dict()
    id_map = dict()
    num_user = len(user)
    for i in range(num_user):
        id_map[user[i]['id']] = i
    for i in range(len(tid)):
        if id_map[author[i]] in id_tweet.keys():
            id_tweet[id_map[author[i]]].append(tweet[i])
        else:
            id_tweet[id_map[author[i]]] = [tweet[i]]
    # reorder labels and splits so they line up with the user index order
    label_order = np.array(label['label'].values)
    split_order = np.array(split['split'].values)
    for i in range(num_user):
        label_order[id_map[label['id'][i]]] = label['label'][i]
        split_order[id_map[split['id'][i]]] = split['split'][i]
    y = (label_order == 'bot').astype(int)
    # boolean masks over users; note Twibot-22 names its validation split 'valid'
    train_split = split_order[0: num_user] == 'train'
    val_split = split_order[0: num_user] == 'valid'
    test_split = split_order[0: num_user] == 'test'
    train_set = np.where(split_order == 'train')[0]
    val_set = np.where(split_order == 'valid')[0]
    test_set = np.where(split_order == 'test')[0]
    print(f"train: {len(train_set)}, val: {len(val_set)}, test: {len(test_set)}")
    # reuse a cached feature matrix if one exists, otherwise build it
    if os.path.exists('feature_matrix_Twibot-22.csv'):
        X = pd.read_csv('feature_matrix_Twibot-22.csv').values
    else:
        X = preprocess(user, tid, author, edge, id_tweet)
    print(f"X shape: {X.shape}")
    # impute NaNs per user with that row's mean feature value
    for i in range(X.shape[0]):
        X[i][np.isnan(X[i])] = np.nanmean(X[i])
else:
    # the smaller benchmarks fit in memory, so plain json.load suffices
    node = json.load(open('../../datasets/' + DATASET + '/node.json', 'r'))
    label = pd.read_csv('../../datasets/' + DATASET + '/label.csv')
    split = pd.read_csv('../../datasets/' + DATASET + '/split.csv')
    edge = pd.read_csv('../../datasets/' + DATASET + '/edge.csv')
    id_map = dict()
    for i in range(len(node)):
        id_map[node[i]['id']] = i
    num_user = label.shape[0]
    # reorder labels and splits so they line up with the node index order
    label_order = np.array(label['label'].values)
    split_order = np.array(split['split'].values)
    for i in range(num_user):
        label_order[id_map[label['id'][i]]] = label['label'][i]
        split_order[id_map[split['id'][i]]] = split['split'][i]
    y = (label_order == 'bot').astype(int)
    # boolean masks over users; these datasets name the validation split 'val'
    train_split = split_order[0: num_user] == 'train'
    val_split = split_order[0: num_user] == 'val'
    test_split = split_order[0: num_user] == 'test'
    train_set = np.where(split_order == 'train')[0]
    val_set = np.where(split_order == 'val')[0]
    test_set = np.where(split_order == 'test')[0]
    print(f"train: {len(train_set)}, val: {len(val_set)}, test: {len(test_set)}")
    # reuse a cached feature matrix if one exists, otherwise build it
    if os.path.exists('feature_matrix_' + DATASET + '.csv'):
        X = pd.read_csv('feature_matrix_' + DATASET + '.csv').values[0: num_user]
    else:
        X = feature_preprocess(node, edge, DATASET)[0: num_user]
        # X = preprocessing(node, edge, DATASET)[0: num_user]
    print(f"X shape: {X.shape}")
    # impute NaNs per user with that row's mean feature value
    for i in range(X.shape[0]):
        X[i][np.isnan(X[i])] = np.nanmean(X[i])
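
# Train and evaluate five random forests with fixed seeds (0, 100, ..., 400),
# collecting the test metrics per seed so run-to-run variance can be reported.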
acc = []
precision = []
recall = []
f1 = []
auc = []
for seed in [0, 100, 200, 300, 400]:
    clf = RandomForestClassifier(oob_score=True, bootstrap=True, random_state=seed)
    clf.fit(X[train_set], y[train_set])
    test_result = clf.predict(X[test_set])
    print(f"acc: {accuracy_score(y[test_set], test_result):.4f}, "
          f"precision: {precision_score(y[test_set], test_result):.4f}, "
          f"recall: {recall_score(y[test_set], test_result):.4f}, "
          f"f1-score: {f1_score(y[test_set], test_result):.4f}, "
          f"roc_auc: {roc_auc_score(y[test_set], test_result):.4f}")
    acc.append(accuracy_score(y[test_set], test_result))
    precision.append(precision_score(y[test_set], test_result))
    recall.append(recall_score(y[test_set], test_result))
    f1.append(f1_score(y[test_set], test_result))
    auc.append(roc_auc_score(y[test_set], test_result))
# persist the per-seed metrics for the chosen dataset
if not os.path.exists('results'):
    os.mkdir('results')
results = pd.DataFrame({'acc': acc, 'precision': precision, 'recall': recall, 'f1': f1, 'auc': auc})
results.to_csv('results/' + DATASET + '.csv', index=False)
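
# Optional addition, not in the upstream script: a quick mean/std summary of
# the five runs, assuming the metric lists above were populated as expected.
print(f"mean acc: {np.mean(acc):.4f} (+/- {np.std(acc):.4f}), "
      f"mean f1: {np.mean(f1):.4f} (+/- {np.std(f1):.4f})")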