
Merge pull request #20 from helmisatria/feat/support-gephi-format
feat: support gephi format
helmisatria authored Jan 27, 2024
2 parents e973db9 + 9f6364d commit 007e35d
Showing 8 changed files with 115 additions and 35 deletions.
Binary file added .DS_Store
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,12 @@
# Changelog

## 2.5.3 (2024-01-27)

### Changes

- Fix: consistency of csv headers order
- Feat: convert tweet-harvest csv to gephi format source,target
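  The "gephi format" mentioned here is simply an edge list that Gephi's spreadsheet importer understands: a CSV whose two columns are named source and target, one row per reply relationship. With hypothetical usernames, the converted file looks roughly like:

  source,target
  alice,bob
  carol,dave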

## 2.5.0 (2024-01-19)

### Changes
7 changes: 5 additions & 2 deletions package.json
@@ -1,7 +1,7 @@
{
"name": "tweet-harvest",
"description": "A Twitter crawler helper with auth",
"version": "2.5.2",
"version": "2.5.3",
"license": "MIT",
"author": "Helmi Satria",
"publishConfig": {
@@ -24,6 +24,7 @@
"dotenv": "^16.4.1",
"lodash": "^4.17.21",
"minimist": "^1.2.8",
"papaparse": "^5.4.1",
"playwright-extra": "^4.3.6",
"prompts": "^2.4.2",
"puppeteer-extra-plugin-stealth": "^2.11.2",
@@ -34,6 +35,7 @@
"devDependencies": {
"@swc/core": "^1.3.106",
"@types/lodash": "^4.14.202",
"@types/papaparse": "^5.3.14",
"@types/prompts": "^2.4.9",
"@types/yargs": "^17.0.32",
"pkg": "^5.8.1",
@@ -43,7 +45,8 @@
"typescript": "^5.3.3"
},
"bin": {
"tweet-harvest": "dist/bin.js"
"tweet-harvest": "dist/bin.js",
"tweet-harvest-gephi": "dist/scripts/convert-source-target.js"
},
"scripts": {
"start": "ts-node src/crawl.ts",
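The new tweet-harvest-gephi bin entry exposes the converter added in this commit as its own command. Assuming the package is installed (or pulled in via npx -p tweet-harvest), an invocation would presumably look something like this, with illustrative file names:

  npx -p tweet-harvest tweet-harvest-gephi -i tweets.csv -o tweets-gephi.csv

The -i/--input and -o/--output flags are the commander options defined in src/scripts/convert-source-target.ts further down.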
16 changes: 16 additions & 0 deletions pnpm-lock.yaml


2 changes: 1 addition & 1 deletion src/constants.ts
@@ -24,7 +24,7 @@ export const FILTERED_FIELDS = [
"conversation_id_str",
"username",
"tweet_url",
"in_reply_to_screen_name",
"image_url",
"location",
"in_reply_to_screen_name",
];
50 changes: 19 additions & 31 deletions src/crawl.ts
@@ -18,17 +18,26 @@ import {
} from "./constants";
import { CACHE_KEYS, cache } from "./cache";
import { logError, scrollDown } from "./helpers/page.helper";
import Papa from "papaparse";
import _ from "lodash";

chromium.use(stealth());

let headerWritten = false;

function appendCsv(pathStr: string, contents: any, cb?) {
const dirName = path.dirname(pathStr);
function appendCsv(pathStr: string, jsonData: Record<string, any>[]) {
const fileName = path.resolve(pathStr);

fs.mkdirSync(dirName, { recursive: true });
fs.appendFileSync(fileName, contents, cb);
const csv = Papa.unparse(jsonData, {
quotes: true, // Wrap every datum in quotes
header: !headerWritten, // Write header only if it's not written yet
skipEmptyLines: true, // Don't write empty lines
});

headerWritten = true; // Set header as written

fs.appendFileSync(fileName, csv);
fs.appendFileSync(fileName, "\r\n");

return fileName;
}
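A rough sketch of how the rewritten appendCsv behaves across crawl batches, with a made-up file name and rows: the first call in a run writes the header because the module-level headerWritten flag is still false; later calls in the same process append data rows only.

  // First batch: Papa.unparse emits the header row, then the data.
  appendCsv("tweets.csv", [{ username: "alice", full_text: "hello" }]);
  // Subsequent batches: headerWritten is now true, so only rows are appended.
  appendCsv("tweets.csv", [{ username: "bob", full_text: "world" }]);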
@@ -37,18 +46,6 @@ type StartCrawlTwitterParams = {
twitterSearchUrl?: string;
};

function convertValuesToStrings(obj) {
const result = {};
for (const key in obj) {
if (typeof obj[key] === "object" && obj[key] !== null) {
result[key] = convertValuesToStrings(obj[key]); // Recursively convert nested object values
} else {
result[key] = `"${String(obj[key])}"`;
}
}
return result;
}

export type CrawlParams = {
ACCESS_TOKEN: string;
SEARCH_KEYWORDS?: string;
@@ -232,14 +229,6 @@ export async function crawl({

cache.set(CACHE_KEYS.GOT_TWEETS, true);

const headerRow = FILTERED_FIELDS.map((field) => `"${field}"`).join(",") + "\n";
const isAlreadyHaveHeader = fs.existsSync(FILE_NAME);

if (!headerWritten) {
headerWritten = true;
if (!isAlreadyHaveHeader) appendCsv(FILE_NAME, headerRow);
}

const tweetContents = tweets
.map((tweet) => {
const isPromotedTweet = tweet.entryId.includes("promoted");
@@ -284,7 +273,7 @@
console.info(chalk.green(`Created new directory: ${dirFullPath}`));
}

const rows = comingTweets.reduce((prev: [], current: (typeof tweetContents)[0]) => {
const rows = comingTweets.map((current: (typeof tweetContents)[0]) => {
const tweet = pick(current.tweet, FILTERED_FIELDS);

const charsToReplace = ["\n", ",", '"', "⁦", "⁩", "’", "‘", "“", "”", "…", "—", "–", "•"];
@@ -316,15 +305,14 @@
tweet["tweet_url"] = `https://twitter.com/${current.user.screen_name}/status/${tweet.id_str}`;
tweet["image_url"] = current.tweet.entities?.media?.[0]?.media_url_https || "";
tweet["location"] = current.user.location || "";
tweet["reply_to_username"] = current.tweet.in_reply_to_screen_name || "";
tweet["in_reply_to_screen_name"] = current.tweet.in_reply_to_screen_name || "";

const row = Object.values(convertValuesToStrings(tweet)).join(",");
return tweet;
});

return [...prev, row];
}, []);
const sortedArrayOfObjects = _.map(rows, (obj) => _.fromPairs(_.sortBy(Object.entries(obj), 0)));

const csv = (rows as []).join("\n") + "\n";
const fullPathFilename = appendCsv(FILE_NAME, csv);
const fullPathFilename = appendCsv(FILE_NAME, sortedArrayOfObjects);

console.info(chalk.blue(`\n\nYour tweets saved to: ${fullPathFilename}`));

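The lodash line above is what the changelog's "consistency of csv headers order" fix refers to: each row object is rebuilt with its keys sorted alphabetically before being passed to appendCsv, so Papa.unparse sees the same column order in every batch. A minimal standalone sketch of the idiom, using a made-up row:

  import _ from "lodash";

  const row = { username: "alice", created_at: "2024-01-27", full_text: "hello" };
  // Sort the [key, value] pairs by key (index 0), then rebuild the object.
  const sorted = _.fromPairs(_.sortBy(Object.entries(row), 0));
  // Object.keys(sorted) -> ["created_at", "full_text", "username"]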
66 changes: 66 additions & 0 deletions src/scripts/convert-source-target.ts
@@ -0,0 +1,66 @@
#!/usr/bin/env node

import * as fs from "fs";
import * as Papa from "papaparse";
import { program } from "commander";
import { pick } from "lodash";

interface InputRow {
username: string;
in_reply_to_screen_name: string;
}

interface OutputRow {
source: string;
target: string;
}

function readCSV(filePath: string): Promise<InputRow[]> {
return new Promise((resolve, reject) => {
const fileContent = fs.readFileSync(filePath, "utf8");
Papa.parse(fileContent, {
header: true,
complete: (result) => {
const data = (result.data.map((d) => pick(d, ["username", "in_reply_to_screen_name"])) as InputRow[]).filter(
(d) => d.username || d.in_reply_to_screen_name
);
resolve(data);
},
error: (error) => reject(error),
});
});
}

function writeCSV(filePath: string, data: OutputRow[]): void {
const csv = Papa.unparse(data, {
columns: ["source", "target"],
delimiter: ",",
header: true,
quotes: true,
});
fs.writeFileSync(filePath, csv, "utf8");
console.log(`CSV file was written successfully to ${filePath}`);
}

async function transformCSV(inputFilePath: string, outputFilePath: string) {
try {
const inputData = await readCSV(inputFilePath);
const outputData: OutputRow[] = inputData.map((row) => ({
source: row.username,
target: row.in_reply_to_screen_name || "",
}));

writeCSV(outputFilePath, outputData);
} catch (error) {
console.error("Error processing CSV file:", error);
}
}

program
.requiredOption("-i, --input <path>", "Input CSV file path")
.requiredOption("-o, --output <path>", "Output CSV file path");

program.parse(process.argv);

const options = program.opts();
transformCSV(options.input, options.output);
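End to end, the script keeps only the username and in_reply_to_screen_name columns of the crawler's CSV and re-emits them as source and target. Given a hypothetical input like:

  username,in_reply_to_screen_name,full_text
  alice,bob,"hi @bob"
  carol,,just a tweet

the output would be an edge list along the lines of:

  "source","target"
  "alice","bob"
  "carol",""

Non-reply tweets keep an empty target because the script falls back to "" when in_reply_to_screen_name is missing.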
2 changes: 1 addition & 1 deletion tsup.config.ts
@@ -1,6 +1,6 @@
import { defineConfig } from "tsup";

export default defineConfig({
entry: ["src/bin.ts"],
entry: ["src/bin.ts", "src/scripts/convert-source-target.ts"],
splitting: true,
});
