Higher Performance of Botrgcn on Twibot22? #30

un0o7 · 2023-04-09T02:37:59Z

I followed the same preprocessing method on TwiBot-22. However, when I ran BotRGCN on it, the results were better than yours.

Could you provide me with your processed TwiBot-22 dataset so that I can test on it?

BunsenFeng · 2023-04-10T02:07:39Z

Thank you for reporting these results. I believe @leopoldwhite and @whr000001 are looking into this now.

whr000001 · 2023-04-12T07:17:19Z

Hi, thank you for your interest in our work. Could you please provide more detailed information about how you train and test BotRGCN and how you preprocess the dataset? Thanks.

un0o7 · 2023-04-12T07:20:18Z

`
import torch
from torch.utils.data import Dataset
from torch import nn
from torch_geometric.nn import RGCNConv
import torch.nn.functional as F
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
import random
import numpy as np

def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # NOTE(review): the pasted snippet lost its indentation, so whether
        # this flag was originally set inside or after the CUDA branch is a
        # guess - confirm against the author's original file.
        torch.backends.cudnn.deterministic = True


seed_everything(2026)

class Twibot22(Dataset):
    """Loader for the pre-processed TwiBot-22 tensors saved under ``root``.

    ``root`` is concatenated directly with each file name, so it must end
    with a path separator (as the default ``'./Data/'`` does).
    """

    def __init__(
        self,
        root='./Data/',
        device='cpu',
    ):
        """Remember where the tensors live and which device to move them to.

        Args:
            root: Directory prefix (including trailing separator) of the
                ``*.pt`` files.
            device: Device the feature, edge and label tensors are moved to.
        """
        self.root = root
        self.device = device

    def _load(self, file_name):
        """Load ``file_name`` from ``self.root`` with ``torch.load``."""
        return torch.load(self.root + file_name)

    def train_val_test_mask(self):
        """Load the train / validation / test index objects.

        Returns:
            Tuple ``(train_idx, val_idx, test_idx)`` exactly as stored on
            disk; they are not moved to ``self.device``.
        """
        train_idx = self._load('train_idx.pt')
        val_idx = self._load('val_idx.pt')
        test_idx = self._load('test_idx.pt')

        return train_idx, val_idx, test_idx

    def dataloader(self):
        """Load every tensor needed for training.

        Returns:
            Tuple ``(des_tensor, tweets_tensor, num_prop, category_prop,
            edge_index, edge_type, labels, train_idx, val_idx, test_idx)``.
            The first seven entries are moved to ``self.device``.
        """
        labels = self._load('label.pt').to(self.device)
        des_tensor = self._load('des_tensor.pt').to(self.device)
        tweets_tensor = self._load('tweets_tensor.pt').to(self.device)
        num_prop = self._load('num_properties_tensor.pt').to(self.device)
        category_prop = self._load('cat_properties_tensor.pt').to(self.device)
        edge_index = self._load('edge_index.pt').to(self.device)
        edge_type = self._load('edge_type.pt').to(self.device)

        train_idx, val_idx, test_idx = self.train_val_test_mask()
        return (des_tensor, tweets_tensor, num_prop, category_prop,
                edge_index, edge_type, labels, train_idx, val_idx, test_idx)

class BotRGCN(nn.Module):
    """Relational-GCN classifier over four per-node feature groups.

    Each feature group (description, tweets, numerical properties,
    categorical properties) is projected to a quarter of
    ``embedding_dimension``; the four projections are concatenated, passed
    through one ``RGCNConv`` layer applied twice, and mapped to two logits.
    """

    def __init__(self,
                 des_size=768,
                 tweet_size=768,
                 num_prop_size=5,
                 cat_prop_size=3,
                 embedding_dimension=128,
                 dropout=0.3):
        """Build the projection, graph and output layers.

        Args:
            des_size: Width of the description feature vectors.
            tweet_size: Width of the tweet feature vectors.
            num_prop_size: Width of the numerical property vectors.
            cat_prop_size: Width of the categorical property vectors.
            embedding_dimension: Hidden width. The four quarter-width
                projections are concatenated and fed to a layer expecting
                this width, so it has to be divisible by 4.
            dropout: Dropout probability used between the two graph passes.
        """
        super().__init__()
        self.dropout = dropout
        quarter = embedding_dimension // 4

        self.linear_relu_des = nn.Sequential(
            nn.Linear(des_size, quarter), nn.LeakyReLU())
        self.linear_relu_tweet = nn.Sequential(
            nn.Linear(tweet_size, quarter), nn.LeakyReLU())
        self.linear_relu_num_prop = nn.Sequential(
            nn.Linear(num_prop_size, quarter), nn.LeakyReLU())
        self.linear_relu_cat_prop = nn.Sequential(
            nn.Linear(cat_prop_size, quarter), nn.LeakyReLU())

        self.linear_relu_input = nn.Sequential(
            nn.Linear(embedding_dimension, embedding_dimension),
            nn.LeakyReLU())

        # A single graph layer; forward() applies it twice, so both passes
        # share the same weights.
        self.rgcn = RGCNConv(embedding_dimension,
                             embedding_dimension,
                             num_relations=2)

        self.linear_relu_output1 = nn.Sequential(
            nn.Linear(embedding_dimension, embedding_dimension),
            nn.LeakyReLU())
        self.linear_output2 = nn.Linear(embedding_dimension, 2)

    def forward(self, des, tweet, num_prop, cat_prop, edge_index, edge_type):
        """Compute two-class logits for every node.

        Args:
            des: 2-D description features, second dimension ``des_size``.
            tweet: 2-D tweet features, second dimension ``tweet_size``.
            num_prop: 2-D numerical properties.
            cat_prop: 2-D categorical properties.
            edge_index: Graph connectivity passed to ``RGCNConv``.
            edge_type: Relation id per edge passed to ``RGCNConv``
                (presumably values in {0, 1} given ``num_relations=2`` -
                TODO confirm).

        Returns:
            Tensor with one row of two logits per input row.
        """
        d = self.linear_relu_des(des)
        t = self.linear_relu_tweet(tweet)
        n = self.linear_relu_num_prop(num_prop)
        c = self.linear_relu_cat_prop(cat_prop)
        x = torch.cat((d, t, n, c), dim=1)

        x = self.linear_relu_input(x)
        x = self.rgcn(x, edge_index, edge_type)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.rgcn(x, edge_index, edge_type)
        x = self.linear_relu_output1(x)
        x = self.linear_output2(x)

        return x

def accuracy(output, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        output: 2-D tensor of per-class scores, one row per sample.
        labels: 1-D tensor of class indices, one per row of ``output``.

    Returns:
        0-d double tensor: number of correct predictions / ``len(labels)``.
    """
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)

def init_weights(m):
    """Re-initialise the weight of ``nn.Linear`` modules (Kaiming uniform).

    Intended to be used with ``Module.apply``; modules that are not linear
    layers, and the bias of linear layers, are left untouched.

    Args:
        m: Module visited by ``Module.apply``.
    """
    if isinstance(m, nn.Linear):
        nn.init.kaiming_uniform_(m.weight)

# Run on the first GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters of the run.
# NOTE(review): `dropout` is defined here but never passed to BotRGCN below,
# so the model trains with its constructor default instead of this value.
embedding_size = 32
dropout = 0.1
lr = 1e-2
weight_decay = 5e-2

root = './dataset/twibot-22/'

# Load every feature / graph / label tensor plus the split indices.
dataset = Twibot22(root=root, device=device)
(des_tensor, tweets_tensor, num_prop, category_prop, edge_index, edge_type,
 labels, train_idx, val_idx, test_idx) = dataset.dataloader()

model = BotRGCN(cat_prop_size=3, embedding_dimension=embedding_size)
model = model.to(device)
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(), lr=lr, weight_decay=weight_decay)

def train(epoch):
    """Run one full-graph optimisation step and print progress.

    Uses the module-level ``model``, ``loss``, ``optimizer`` and data tensors.

    Args:
        epoch: Zero-based epoch counter (printed as ``epoch + 1``).

    Returns:
        Tuple ``(acc_train, loss_train)`` for this step.
    """
    model.train()
    output = model(des_tensor, tweets_tensor, num_prop, category_prop,
                   edge_index, edge_type)
    loss_train = loss(output[train_idx], labels[train_idx])
    acc_train = accuracy(output[train_idx], labels[train_idx])
    # Validation accuracy is read from the same training-mode forward pass,
    # i.e. with dropout still active.
    acc_val = accuracy(output[val_idx], labels[val_idx])
    optimizer.zero_grad()
    loss_train.backward()
    optimizer.step()
    print(
        'Epoch: {:04d}'.format(epoch + 1),
        'loss_train: {:.4f}'.format(loss_train.item()),
        'acc_train: {:.4f}'.format(acc_train.item()),
        'acc_val: {:.4f}'.format(acc_val.item()),
    )
    return acc_train, loss_train

def test():
    """Evaluate the module-level ``model`` on the test split and print metrics.

    Prints loss, accuracy, precision, recall, F1 and AUC for ``test_idx``.
    """
    model.eval()
    # Evaluation needs no gradients; skipping autograd bookkeeping avoids
    # holding the whole-graph activation history in memory.
    with torch.no_grad():
        output = model(des_tensor, tweets_tensor, num_prop, category_prop,
                       edge_index, edge_type)
        loss_test = loss(output[test_idx], labels[test_idx])
        acc_test = accuracy(output[test_idx], labels[test_idx])
    # From here on `output` holds hard class predictions, not scores.
    output = output.max(1)[1].to('cpu').detach().numpy()
    label = labels.to('cpu').detach().numpy()
    f1 = f1_score(label[test_idx], output[test_idx])
    # mcc=matthews_corrcoef(label[test_idx], output[test_idx])
    precision = precision_score(label[test_idx], output[test_idx])
    recall = recall_score(label[test_idx], output[test_idx])
    # NOTE(review): the ROC curve is built from the hard predictions above,
    # so the reported AUC is not a score-based AUC.
    fpr, tpr, thresholds = roc_curve(label[test_idx],
                                     output[test_idx],
                                     pos_label=1)
    Auc = auc(fpr, tpr)
    print(
        "Test set results:",
        "test_loss= {:.4f}".format(loss_test.item()),
        "test_accuracy= {:.4f}".format(acc_test.item()),
        "precision= {:.4f}".format(precision.item()),
        "recall= {:.4f}".format(recall.item()),
        "f1_score= {:.4f}".format(f1.item()),
        # "mcc= {:.4f}".format(mcc.item()),
        "auc= {:.4f}".format(Auc.item()),
    )

# Re-initialise the linear layers, train for a fixed number of epochs, then
# evaluate once on the test split.
model.apply(init_weights)

epochs = 200
for epoch in range(epochs):
    train(epoch)

test()

`

whr000001 · 2023-04-12T08:00:50Z

I think your model and training code are the same as these. Could you please provide more detailed information about your processed dataset, such as how many users are in the train/val/test sets and how you split them?

un0o7 · 2023-04-12T08:04:01Z

I followed the same split provided in split.csv. There are 1,000,000 users in total: 700,000 for training, 200,000 for validation and 100,000 for testing.

whr000001 · 2023-04-12T09:22:18Z

We used your code to train with our processed data, but could not reproduce your performance. This may be due to differences in preprocessing. Could you please provide your preprocessing code? I believe @leopoldwhite is looking into this now.

un0o7 · 2023-04-12T10:24:07Z

Due to my limited computing resources, I split the tweets into three parts (0-2, 3-5, 6-8) and then merged them. I think that may be where the problem lies.

import torch
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import ijson

user = pd.read_json('user.json')

user_idx = user['id']
# Map every user id to its row position in user.json.
uid_index = {uid: index for index, uid in enumerate(user_idx.values)}


def _empty_buckets():
    """Return one fresh ``{'index': i, 'tweets': []}`` bucket per user."""
    return [{'index': i, 'tweets': []} for i in range(len(user_idx))]


id_tweet = _empty_buckets()

# The nine tweet files are processed in three groups (0-2, 3-5, 6-8). After
# each of the first two groups the buckets are dumped and reset, so that only
# one group's tweets are held in memory at a time.
for i in range(9):
    name = 'tweet_' + str(i) + '.json'
    # Keep the file open for the whole loop: ijson parses lazily while the
    # items are being iterated.
    with open(name, 'r') as tweet_file:
        user_tweets = ijson.items(tweet_file, 'item')
        print("load " + name + " succ")
        for each in tqdm(user_tweets):
            uid = 'u' + str(each['author_id'])
            text = each['text']
            index = uid_index.get(uid)
            if index is None:
                # The author is not listed in user.json; ignore the tweet.
                continue
            id_tweet[index]['tweets'].append(text)
    if i == 2:
        with open('./id_tweet0.json', 'w') as out_file:
            json.dump(id_tweet, out_file)
        id_tweet = _empty_buckets()
    if i == 5:
        with open('./id_tweet1.json', 'w') as out_file:
            json.dump(id_tweet, out_file)
        id_tweet = _empty_buckets()

print("succ")
with open('./id_tweet2.json', 'w') as out_file:
    json.dump(id_tweet, out_file)

un0o7 · 2023-04-12T10:24:24Z

And then I merge these three parts into id_tweet.json.

import torch
import ijson
from transformers import pipeline
import json
from tqdm import tqdm

tweet1_path = "id_tweet0.json"
tweet2_path = "id_tweet1.json"
tweet3_path = "id_tweet2.json"

# Merge the three partial files into one list of tweets per user, keeping at
# most the first 20 tweets of each user.
#
# The earlier version first ran a chunked pass that dumped to
# './id_tweet0.json', './id_tweet1.json', ... - the very files being streamed
# here - and that pass also exhausted the three ijson iterators, so the final
# loop saw no items and id_tweet.json was written as empty lists. A single
# pass over freshly opened inputs avoids both problems.
id_tweet = [[] for _ in range(1000000)]
with open(tweet1_path, 'r') as part1, \
        open(tweet2_path, 'r') as part2, \
        open(tweet3_path, 'r') as part3:
    tweets1 = ijson.items(part1, "item")
    tweets2 = ijson.items(part2, "item")
    tweets3 = ijson.items(part3, "item")
    for i, (tweet1, tweet2,
            tweet3) in tqdm(enumerate(zip(tweets1, tweets2, tweets3))):
        temp = tweet1['tweets'] + tweet2['tweets'] + tweet3['tweets']
        id_tweet[i].extend(temp[:20])

with open('./id_tweet.json', 'w') as out_file:
    json.dump(id_tweet, out_file)

un0o7 · 2023-04-12T10:25:09Z

The last part is the same as yours.

import torch
from tqdm import tqdm
import numpy as np
from transformers import pipeline
import os
import pandas as pd
import json
import ijson

# Lazily stream one item (a user's tweet list) at a time from id_tweet.json.
# The file object is deliberately left open: ijson reads from it while
# `user_tweets` is being iterated later on.
user_tweets = ijson.items(open("id_tweet.json", 'r'), 'item')

# RoBERTa feature-extraction pipeline used to embed each tweet.
# NOTE(review): device=1 presumably selects the GPU with index 1 - confirm
# this matches the machine the script runs on.
_pipeline_options = dict(
    model='roberta-base',
    tokenizer='roberta-base',
    device=1,
    padding=True,
    truncation=True,
    max_length=50,
    add_special_tokens=True,
)
feature_extract = pipeline('feature-extraction', **_pipeline_options)


def _embed_tweet(each_tweet):
    """Return the mean token embedding of one tweet (zeros for ``None``)."""
    if each_tweet is None:
        return torch.zeros(768)
    each_tweet_tensor = torch.tensor(feature_extract(each_tweet))
    # each_tweet_tensor[0] has one row per token; average over the tokens.
    # Same average as summing the rows and dividing by their count, though
    # the last floating-point bits may differ from a sequential sum.
    return each_tweet_tensor[0].mean(dim=0)


def tweets_embedding():
    """Embed every user's tweets and save the result to ``./tweets_tensor.pt``.

    For each item of the module-level ``user_tweets`` stream, the first 20
    tweets are embedded with ``feature_extract`` and averaged; a user without
    tweets gets a 768-dimensional zero vector. The stacked per-user vectors
    are written with ``torch.save``. The tensor is always recomputed; nothing
    is returned.
    """
    print('Running feature2 embedding')
    path = "./tweets_tensor.pt"
    tweets_list = []
    for i, each_user_tweets in enumerate(user_tweets):
        if i % 1000 == 0:
            print(i)
        # Only the first 20 tweets contribute; slicing up front avoids
        # embedding a 21st tweet that would be discarded anyway.
        kept_tweets = each_user_tweets[:20]
        if not kept_tweets:
            total_each_person_tweets = torch.zeros(768)
        else:
            total_each_person_tweets = torch.stack(
                [_embed_tweet(each_tweet)
                 for each_tweet in kept_tweets]).mean(dim=0)

        tweets_list.append(total_each_person_tweets)

    tweet_tensor = torch.stack(tweets_list)
    torch.save(tweet_tensor, path)
    print('Finished')


tweets_embedding()

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Higher Performance of Botrgcn on Twibot22? #30

Higher Performance of Botrgcn on Twibot22? #30

un0o7 commented Apr 9, 2023

BunsenFeng commented Apr 10, 2023

whr000001 commented Apr 12, 2023

un0o7 commented Apr 12, 2023

whr000001 commented Apr 12, 2023

un0o7 commented Apr 12, 2023 •

edited

Loading

whr000001 commented Apr 12, 2023

un0o7 commented Apr 12, 2023 •

edited

Loading

un0o7 commented Apr 12, 2023 •

edited

Loading

un0o7 commented Apr 12, 2023

Higher Performance of Botrgcn on Twibot22? #30

Higher Performance of Botrgcn on Twibot22? #30

Comments

un0o7 commented Apr 9, 2023

BunsenFeng commented Apr 10, 2023

whr000001 commented Apr 12, 2023

un0o7 commented Apr 12, 2023

whr000001 commented Apr 12, 2023

un0o7 commented Apr 12, 2023 • edited Loading

whr000001 commented Apr 12, 2023

un0o7 commented Apr 12, 2023 • edited Loading

un0o7 commented Apr 12, 2023 • edited Loading

un0o7 commented Apr 12, 2023

un0o7 commented Apr 12, 2023 •

edited

Loading

un0o7 commented Apr 12, 2023 •

edited

Loading

un0o7 commented Apr 12, 2023 •

edited

Loading