Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Higher Performance of Botrgcn on Twibot22? #30

Open
un0o7 opened this issue Apr 9, 2023 · 9 comments
Open

Higher Performance of Botrgcn on Twibot22? #30

un0o7 opened this issue Apr 9, 2023 · 9 comments

Comments

@un0o7
Copy link

un0o7 commented Apr 9, 2023

I followed the same preprocessing method on TwiBot-22. However, when I run BotRGCN on it, the results are better than yours.
image
Could you provide me with your processed TwiBot-22 dataset so that I can test on it?

@BunsenFeng
Copy link
Contributor

Thank you for reporting these results. I believe @leopoldwhite and @whr000001 are looking into this now.

@whr000001
Copy link
Collaborator

Hi, thank you for your interest in our work. Could you please provide more detailed information about how you train and test BotRGCN and how to preprocess the dataset? Thanks.

@un0o7
Copy link
Author

un0o7 commented Apr 12, 2023

`
import torch
from torch.utils.data import Dataset
from torch import nn
from torch_geometric.nn import RGCNConv
import torch.nn.functional as F
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
import random
import numpy as np

def seed_everything(seed):
    """Seed every random number generator used by this script.

    Seeds Python's ``random``, NumPy and PyTorch (CPU, plus all CUDA devices
    when CUDA is available) and asks cuDNN for deterministic algorithms.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    # The pasted source lost its indentation, so whether this flag sat inside
    # the CUDA branch is unknown; setting it unconditionally is harmless on
    # CPU-only machines.
    torch.backends.cudnn.deterministic = True


seed_everything(2026)

class Twibot22(Dataset):
    """Loader for the pre-processed TwiBot-22 tensors stored under ``root``.

    ``root`` is prepended verbatim to each file name, so it must end with a
    path separator (e.g. ``'./Data/'``).
    """

    def __init__(
        self,
        root='./Data/',
        device='cpu',
    ):
        """Remember where the tensors live and which device they go to.

        Args:
            root: Directory prefix (with trailing separator) of the ``.pt`` files.
            device: Device the feature, edge and label tensors are moved to.
        """
        self.root = root
        self.device = device

    def _load_to_device(self, filename):
        """Load ``root + filename`` and place the tensor on ``self.device``."""
        # map_location lets tensors that were saved on a GPU be opened on a
        # machine without one; the trailing .to() keeps the original contract.
        return torch.load(self.root + filename,
                          map_location=self.device).to(self.device)

    def train_val_test_mask(self):
        """Return the ``(train_idx, val_idx, test_idx)`` objects as stored on disk."""
        train_idx = torch.load(self.root + 'train_idx.pt')
        val_idx = torch.load(self.root + 'val_idx.pt')
        test_idx = torch.load(self.root + 'test_idx.pt')

        return train_idx, val_idx, test_idx

    def dataloader(self):
        """Load every tensor needed for training.

        Returns:
            A 10-tuple ``(des_tensor, tweets_tensor, num_prop, category_prop,
            edge_index, edge_type, labels, train_idx, val_idx, test_idx)``.
            The first seven entries are on ``self.device``; the three index
            objects are returned exactly as loaded.
        """
        labels = self._load_to_device('label.pt')
        des_tensor = self._load_to_device('des_tensor.pt')
        tweets_tensor = self._load_to_device('tweets_tensor.pt')
        num_prop = self._load_to_device('num_properties_tensor.pt')
        category_prop = self._load_to_device('cat_properties_tensor.pt')
        edge_index = self._load_to_device('edge_index.pt')
        edge_type = self._load_to_device('edge_type.pt')

        train_idx, val_idx, test_idx = self.train_val_test_mask()
        return (des_tensor, tweets_tensor, num_prop, category_prop,
                edge_index, edge_type, labels, train_idx, val_idx, test_idx)

class BotRGCN(nn.Module):
    """Relational-GCN bot classifier over four per-user feature views.

    Each view (description, tweets, numerical properties, categorical
    properties) is projected to a quarter of ``embedding_dimension``; the four
    projections are concatenated, passed through a linear layer, two
    applications of one shared ``RGCNConv`` layer, and a two-layer output
    head that emits two logits per node.
    """

    def __init__(self,
                 des_size=768,
                 tweet_size=768,
                 num_prop_size=5,
                 cat_prop_size=3,
                 embedding_dimension=128,
                 dropout=0.3):
        """Build the layers.

        Args:
            des_size: Width of the description feature vectors.
            tweet_size: Width of the tweet feature vectors.
            num_prop_size: Number of numerical properties per user.
            cat_prop_size: Number of categorical properties per user.
            embedding_dimension: Hidden width used from the fusion layer onward.
            dropout: Dropout probability applied between the two RGCN passes.
        """
        super().__init__()
        self.dropout = dropout

        # Integer division replaces int(x / 4); identical for positive ints.
        quarter = embedding_dimension // 4

        self.linear_relu_des = nn.Sequential(
            nn.Linear(des_size, quarter), nn.LeakyReLU())
        self.linear_relu_tweet = nn.Sequential(
            nn.Linear(tweet_size, quarter), nn.LeakyReLU())
        self.linear_relu_num_prop = nn.Sequential(
            nn.Linear(num_prop_size, quarter), nn.LeakyReLU())
        self.linear_relu_cat_prop = nn.Sequential(
            nn.Linear(cat_prop_size, quarter), nn.LeakyReLU())

        # The concatenation in forward() is 4 * quarter wide, which equals
        # embedding_dimension only when that is a multiple of 4. Sizing the
        # input from the real concatenation width keeps other values working.
        self.linear_relu_input = nn.Sequential(
            nn.Linear(4 * quarter, embedding_dimension), nn.LeakyReLU())

        self.rgcn = RGCNConv(embedding_dimension,
                             embedding_dimension,
                             num_relations=2)

        self.linear_relu_output1 = nn.Sequential(
            nn.Linear(embedding_dimension, embedding_dimension),
            nn.LeakyReLU())
        self.linear_output2 = nn.Linear(embedding_dimension, 2)

    def forward(self, des, tweet, num_prop, cat_prop, edge_index, edge_type):
        """Return two logits per node.

        Args:
            des: Description features, last dimension ``des_size``.
            tweet: Tweet features, last dimension ``tweet_size``.
            num_prop: Numerical properties, last dimension ``num_prop_size``.
            cat_prop: Categorical properties, last dimension ``cat_prop_size``.
            edge_index: Graph connectivity, forwarded to ``RGCNConv``.
            edge_type: Relation id per edge, forwarded to ``RGCNConv``.
        """
        d = self.linear_relu_des(des)
        t = self.linear_relu_tweet(tweet)
        n = self.linear_relu_num_prop(num_prop)
        c = self.linear_relu_cat_prop(cat_prop)
        x = torch.cat((d, t, n, c), dim=1)

        x = self.linear_relu_input(x)
        # The same RGCN layer (shared weights) is applied twice, with dropout
        # in between.
        x = self.rgcn(x, edge_index, edge_type)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.rgcn(x, edge_index, edge_type)
        x = self.linear_relu_output1(x)
        x = self.linear_output2(x)

        return x

def accuracy(output, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        output: 2-D tensor of per-class scores, one row per sample.
        labels: 1-D tensor of class indices, one per row of ``output``.

    Returns:
        A 0-dim ``float64`` tensor: number of correct rows / ``len(labels)``.
    """
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)

def init_weights(m):
    """Re-initialise the weight of an ``nn.Linear`` module with Kaiming-uniform.

    Intended for ``model.apply(init_weights)``. Modules that are not linear
    layers are left untouched, and biases are not modified.

    Args:
        m: Any ``nn.Module``.
    """
    # isinstance (rather than an exact type comparison) also covers subclasses
    # of nn.Linear.
    if isinstance(m, nn.Linear):
        nn.init.kaiming_uniform_(m.weight)

# ---- Experiment configuration -------------------------------------------
# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

embedding_size = 32
# NOTE(review): `dropout` is defined here but never handed to BotRGCN below,
# so the model runs with its own default dropout. Confirm this is intended.
dropout = 0.1
lr = 1e-2
weight_decay = 5e-2

root = './dataset/twibot-22/'

# ---- Data ----------------------------------------------------------------
dataset = Twibot22(root=root, device=device)
(
    des_tensor,
    tweets_tensor,
    num_prop,
    category_prop,
    edge_index,
    edge_type,
    labels,
    train_idx,
    val_idx,
    test_idx,
) = dataset.dataloader()

# ---- Model, loss and optimiser ------------------------------------------
model = BotRGCN(cat_prop_size=3, embedding_dimension=embedding_size).to(device)
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=lr,
    weight_decay=weight_decay,
)

def train(epoch):
    """Run one full-graph optimisation step and print progress.

    Uses the module-level ``model``, ``loss``, ``optimizer``, feature tensors
    and index sets. A single forward pass in training mode supplies both the
    training loss/accuracy and the printed validation accuracy, so the
    validation figure is measured with dropout active.

    Args:
        epoch: Zero-based epoch number; printed as ``epoch + 1``.

    Returns:
        ``(acc_train, loss_train)`` as tensors.
    """
    model.train()
    output = model(des_tensor, tweets_tensor, num_prop, category_prop,
                   edge_index, edge_type)
    loss_train = loss(output[train_idx], labels[train_idx])
    acc_train = accuracy(output[train_idx], labels[train_idx])
    acc_val = accuracy(output[val_idx], labels[val_idx])
    optimizer.zero_grad()
    loss_train.backward()
    optimizer.step()
    print(
        'Epoch: {:04d}'.format(epoch + 1),
        'loss_train: {:.4f}'.format(loss_train.item()),
        'acc_train: {:.4f}'.format(acc_train.item()),
        'acc_val: {:.4f}'.format(acc_val.item()),
    )
    return acc_train, loss_train

def test():
    """Evaluate the module-level ``model`` on the test split and print metrics.

    Prints cross-entropy loss, accuracy, precision, recall, F1 and AUC for
    the nodes selected by ``test_idx``. Returns nothing.
    """
    model.eval()
    # Inference only: skip building the autograd graph for the whole graph.
    with torch.no_grad():
        output = model(des_tensor, tweets_tensor, num_prop, category_prop,
                       edge_index, edge_type)
        loss_test = loss(output[test_idx], labels[test_idx])
        acc_test = accuracy(output[test_idx], labels[test_idx])
    output = output.max(1)[1].to('cpu').detach().numpy()
    label = labels.to('cpu').detach().numpy()
    f1 = f1_score(label[test_idx], output[test_idx])
    precision = precision_score(label[test_idx], output[test_idx])
    recall = recall_score(label[test_idx], output[test_idx])
    # NOTE(review): the ROC curve is built from hard 0/1 predictions rather
    # than class scores, so the reported AUC reflects a single operating
    # point. Kept as-is so the numbers stay comparable with earlier runs.
    fpr, tpr, thresholds = roc_curve(label[test_idx],
                                     output[test_idx],
                                     pos_label=1)
    Auc = auc(fpr, tpr)
    print(
        "Test set results:",
        "test_loss= {:.4f}".format(loss_test.item()),
        "test_accuracy= {:.4f}".format(acc_test.item()),
        "precision= {:.4f}".format(precision.item()),
        "recall= {:.4f}".format(recall.item()),
        "f1_score= {:.4f}".format(f1.item()),
        "auc= {:.4f}".format(Auc.item()),
    )

# Re-initialise the linear layers, train for a fixed number of epochs, then
# report metrics on the test split.
model.apply(init_weights)

epochs = 200
for epoch in range(epochs):
    train(epoch)

test()

`

@whr000001
Copy link
Collaborator

I think your model and training codes are the same as these. Could you please provide more detailed information about your processed dataset, like how many users are in the train/val/test set, and how to split them?

@un0o7
Copy link
Author

un0o7 commented Apr 12, 2023

I follow the same split provided in split.csv. There are 1,000,000 users in total: 700,000 for training, 200,000 for validation, and 100,000 for testing.

@whr000001
Copy link
Collaborator

We use your code to train with our processed data, but can not achieve your performance. This may be due to differences in preprocessing. Could you please provide your preprocess codes? I believe @leopoldwhite is looking into this now.

@un0o7
Copy link
Author

un0o7 commented Apr 12, 2023

Due to my limited computing resources, I split the tweet files into three parts (0-2, 3-5, 6-8) and then merged them. I think that may be where the problem lies.

import torch
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import ijson

# Group the text of every tweet under the row index of its author.
# Tweet files 0-2, 3-5 and 6-8 are processed as three separate batches, each
# dumped to its own id_tweet{0,1,2}.json, to limit peak memory.
user = pd.read_json('user.json')

user_idx = user['id']
uid_index = {uid: index for index, uid in enumerate(user_idx.values)}


def _empty_buckets():
    """Return one empty ``{'index', 'tweets'}`` record per user row."""
    return [{'index': i, 'tweets': []} for i in range(len(user_idx))]


id_tweet = _empty_buckets()

for i in range(9):
    name = 'tweet_' + str(i) + '.json'
    # ijson is a byte-oriented parser, so the file is opened in binary mode;
    # the context manager closes it once the batch has been streamed.
    with open(name, 'rb') as tweet_file:
        user_tweets = ijson.items(tweet_file, 'item')
        print("load " + name + " succ")
        for each in tqdm(user_tweets):
            uid = 'u' + str(each['author_id'])
            text = each['text']
            index = uid_index.get(uid)
            if index is None:
                # Author is not listed in user.json: skip the tweet.
                continue
            id_tweet[index]['tweets'].append(text)
    if i in (2, 5):
        # End of the first or second batch: flush it and start a fresh one.
        with open('./id_tweet' + str(i // 3) + '.json', 'w') as out_file:
            json.dump(id_tweet, out_file)
        id_tweet = _empty_buckets()

print("succ")
with open('./id_tweet2.json', 'w') as out_file:
    json.dump(id_tweet, out_file)

@un0o7
Copy link
Author

un0o7 commented Apr 12, 2023

And then I merge these three parts into id_tweet.json.

import torch
import ijson
from transformers import pipeline
import json
from tqdm import tqdm

# Merge the three per-batch dumps (id_tweet0/1/2.json) row by row.
tweet1_path = "id_tweet0.json"
tweet2_path = "id_tweet1.json"
tweet3_path = "id_tweet2.json"

CHUNK_SIZE = 250000
NUM_USERS = 1000000
MAX_TWEETS_PER_USER = 20


def _merged_tweets():
    """Yield, for each user row in order, the tweets of all three dumps joined.

    The input files are reopened on every call, so each pass over the data
    gets fresh iterators; an ijson iterator cannot be consumed twice.
    """
    with open(tweet1_path, 'rb') as f1, \
            open(tweet2_path, 'rb') as f2, \
            open(tweet3_path, 'rb') as f3:
        rows = zip(ijson.items(f1, "item"),
                   ijson.items(f2, "item"),
                   ijson.items(f3, "item"))
        for tweet1, tweet2, tweet3 in rows:
            yield tweet1['tweets'] + tweet2['tweets'] + tweet3['tweets']


def _dump_chunk(chunk, number):
    """Write one chunk to disk under a name that cannot clash with the inputs."""
    # The original wrote to './id_tweet<N>.json', which truncated the input
    # files (id_tweet0.json, ...) while they were still being read.
    with open('./id_tweet_chunk' + str(number) + '.json', 'w') as out_file:
        json.dump(chunk, out_file)


# Pass 1: full tweet lists, written in chunks of CHUNK_SIZE users.
count = 0
id_tweet = {i: [] for i in range(CHUNK_SIZE)}
for i, temp in tqdm(enumerate(_merged_tweets())):
    if i % CHUNK_SIZE == 0 and i != 0:
        _dump_chunk(id_tweet, count)
        id_tweet = {k: [] for k in range(i, i + CHUNK_SIZE)}
        count += 1

    id_tweet[i].append(temp)

_dump_chunk(id_tweet, count)

# Pass 2: at most MAX_TWEETS_PER_USER tweets per user, in a single file.
# This pass re-reads the inputs; reusing the iterators from pass 1 would
# yield nothing and leave every user with an empty list.
id_tweet = [[] for _ in range(NUM_USERS)]
for i, temp in tqdm(enumerate(_merged_tweets())):
    id_tweet[i].extend(temp[:MAX_TWEETS_PER_USER])

with open('./id_tweet.json', 'w') as out_file:
    json.dump(id_tweet, out_file)

@un0o7
Copy link
Author

un0o7 commented Apr 12, 2023

The last part is the same as yours.

import torch
from tqdm import tqdm
import numpy as np
from transformers import pipeline
import os
import pandas as pd
import json
import ijson

# Stream the per-user tweet lists lazily. ijson is a byte-oriented parser, so
# the file is opened in binary mode. The handle has to stay open for as long
# as `user_tweets` is being iterated, so it is not closed here.
user_tweets = ijson.items(open("id_tweet.json", 'rb'), 'item')

# RoBERTa feature extractor.
# NOTE(review): device=1 selects the second GPU; adjust on machines that
# have only one.
feature_extract = pipeline('feature-extraction',
                           model='roberta-base',
                           tokenizer='roberta-base',
                           device=1,
                           padding=True,
                           truncation=True,
                           max_length=50,
                           add_special_tokens=True)


def tweets_embedding(tweets=None, extractor=None, path="./tweets_tensor.pt",
                     max_tweets=20):
    """Embed each user's tweets, save the stacked tensor, and return it.

    For every user, each of the first ``max_tweets`` tweets is turned into one
    vector by averaging the extractor's token vectors; a ``None`` tweet counts
    as a zero vector. The user's embedding is the mean of those tweet vectors,
    or a zero vector when the user has no tweets.

    Args:
        tweets: Iterable of per-user tweet lists. Defaults to the module-level
            ``user_tweets``.
        extractor: Callable mapping a tweet string to a nested list shaped
            ``(1, tokens, width)``. Defaults to the module-level
            ``feature_extract``.
        path: Where the stacked tensor is saved with ``torch.save``.
        max_tweets: Maximum number of tweets per user that are embedded.

    Returns:
        Tensor with one row per user.
    """
    if tweets is None:
        tweets = user_tweets
    if extractor is None:
        extractor = feature_extract

    print('Running feature2 embedding')
    tweets_list = []
    for i, each_user_tweets in enumerate(tweets):
        if i % 1000 == 0:
            print(i)

        considered = each_user_tweets[:max_tweets]
        if not considered:
            # 768 is the width the zero placeholders have always used here;
            # it is assumed to equal the extractor's output width.
            tweets_list.append(torch.zeros(768))
            continue

        per_tweet = []
        for each_tweet in considered:
            if each_tweet is None:
                per_tweet.append(torch.zeros(768))
            else:
                # [0] drops the batch dimension; the mean runs over tokens.
                token_vectors = torch.tensor(extractor(each_tweet))[0]
                per_tweet.append(token_vectors.mean(dim=0))
        tweets_list.append(torch.stack(per_tweet).mean(dim=0))

    tweet_tensor = torch.stack(tweets_list)
    torch.save(tweet_tensor, path)
    print('Finished')
    return tweet_tensor


# Build and save the per-user tweet embeddings when this script is run.
tweets_embedding()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

3 participants