From 87ce06bbef49b0cb459d3df7a850439218ce469b Mon Sep 17 00:00:00 2001 From: Helmi Satria Date: Wed, 25 Oct 2023 22:54:53 +0700 Subject: [PATCH] feat: able to get data from tab/latest search tab --- CHANGELOG.md | 6 ++++++ package.json | 2 +- src/bin.ts | 7 +++++++ src/constants.ts | 4 ++++ src/crawl.ts | 17 +++++++++++------ src/dev.ts | 1 + 6 files changed, 30 insertions(+), 7 deletions(-) create mode 100644 src/constants.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 1211217..ec5113f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ ### Added +- Add `SEARCH_TAB` or `--search_tab` or `--tab` option to specify the tab to search for tweets. Default is `LATEST` tab. The options are `LATEST` and `TOP`. + +## 2.3.0 (2023-10-25) + +### Added + - Implemented optional exponential backoff for rate limit handling. The wait time between retries will now be calculated dynamically based on the number of attempts made, resulting in fewer requests during the rate-limit window. This should help to reduce the risk of account bans. To utilize this feature, set the `ENABLE_EXPONENTIAL_BACKOFF` environment variable to true. 
### Changed diff --git a/package.json b/package.json index ff66899..ac9f5a6 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "tweet-harvest", "description": "A Twitter crawler helper with auth", - "version": "2.3.0", + "version": "2.4.0", "license": "MIT", "author": "Helmi Satria", "publishConfig": { diff --git a/src/bin.ts b/src/bin.ts index 1f9f807..696cdbc 100644 --- a/src/bin.ts +++ b/src/bin.ts @@ -58,6 +58,12 @@ async function run() { describe: "Output filename", type: "string", }, + search_tab: { + alias: "tab", + describe: "Search tab (TOP or LATEST)", + default: "LATEST", + choices: ["TOP", "LATEST"], + }, }) .help() .alias("help", "h").argv; @@ -145,6 +151,7 @@ async function run() { TARGET_TWEET_COUNT: argv.limit, DELAY_EACH_TWEET_SECONDS: argv.delay_each_tweet, OUTPUT_FILENAME: argv.output_filename, + SEARCH_TAB: String(argv.search_tab).toUpperCase() as "TOP" | "LATEST", }); } catch (err) { console.error("Error running script:", err); diff --git a/src/constants.ts b/src/constants.ts new file mode 100644 index 0000000..215a931 --- /dev/null +++ b/src/constants.ts @@ -0,0 +1,4 @@ +export const TWITTER_SEARCH_ADVANCED_URL = { + TOP: "https://twitter.com/search-advanced", + LATEST: "https://twitter.com/search-advanced?f=live", +}; diff --git a/src/crawl.ts b/src/crawl.ts index e3c79f5..7aaf788 100644 --- a/src/crawl.ts +++ b/src/crawl.ts @@ -10,6 +10,7 @@ import { inputKeywords } from "./features/input-keywords"; import { listenNetworkRequests } from "./features/listen-network-requests"; import { calculateForRateLimit } from "./features/exponential-backoff"; import { HEADLESS_MODE } from "./env"; +import { TWITTER_SEARCH_ADVANCED_URL } from "./constants"; chromium.use(stealth()); @@ -57,6 +58,7 @@ export async function crawl({ DELAY_EVERY_100_TWEETS_SECONDS = 10, DEBUG_MODE, OUTPUT_FILENAME, + SEARCH_TAB = "LATEST", }: { ACCESS_TOKEN: string; SEARCH_KEYWORDS?: string; @@ -68,8 +70,11 @@ export async function crawl({ DEBUG_MODE?: boolean; 
OUTPUT_FILENAME?: string; TWEET_THREAD_URL?: string; + SEARCH_TAB?: "LATEST" | "TOP"; }) { const CRAWL_MODE = TWEET_THREAD_URL ? "DETAIL" : "SEARCH"; + const SWITCHED_SEARCH_TAB = SEARCH_TAB === "TOP" ? "LATEST" : "TOP"; + const IS_DETAIL_MODE = CRAWL_MODE === "DETAIL"; const IS_SEARCH_MODE = CRAWL_MODE === "SEARCH"; const TIMEOUT_LIMIT = 4; @@ -94,7 +99,7 @@ export async function crawl({ fs.renameSync(FILE_NAME, FILE_NAME.replace(".csv", ".old.csv")); } - let TWEETS_NOT_FOUND_ON_LIVE_TAB = false; + let TWEETS_NOT_FOUND_ON_CURRENT_TAB = false; const browser = await chromium.launch({ headless: HEADLESS_MODE }); @@ -123,7 +128,7 @@ export async function crawl({ listenNetworkRequests(page); async function startCrawlTwitter({ - twitterSearchUrl = "https://twitter.com/search-advanced?f=live", + twitterSearchUrl = TWITTER_SEARCH_ADVANCED_URL[SEARCH_TAB], }: StartCrawlTwitterParams = {}) { if (IS_DETAIL_MODE) { await page.goto(TWEET_THREAD_URL); @@ -214,7 +219,7 @@ export async function crawl({ if (!tweets.length) { // found text "not found" on the page if (await page.getByText("No results for").count()) { - TWEETS_NOT_FOUND_ON_LIVE_TAB = true; + TWEETS_NOT_FOUND_ON_CURRENT_TAB = true; console.info("No tweets found for the search criteria"); break; } @@ -354,11 +359,11 @@ export async function crawl({ try { await startCrawlTwitter(); - if (TWEETS_NOT_FOUND_ON_LIVE_TAB && (SEARCH_FROM_DATE || SEARCH_TO_DATE)) { - console.info('No tweets found on "Latest" tab, trying "Top" tab...'); + if (TWEETS_NOT_FOUND_ON_CURRENT_TAB && (SEARCH_FROM_DATE || SEARCH_TO_DATE)) { + console.info(`No tweets found on "${SEARCH_TAB}" tab, trying "${SWITCHED_SEARCH_TAB}" tab...`); await startCrawlTwitter({ - twitterSearchUrl: "https://twitter.com/search-advanced", + twitterSearchUrl: TWITTER_SEARCH_ADVANCED_URL[SWITCHED_SEARCH_TAB], }); } } catch (error) { diff --git a/src/dev.ts b/src/dev.ts index f2dcfc7..fd639f4 100644 --- a/src/dev.ts +++ b/src/dev.ts @@ -9,4 +9,5 @@ crawl({ OUTPUT_FILENAME: 
"gibran.csv", DELAY_EACH_TWEET_SECONDS: 0.1, DELAY_EVERY_100_TWEETS_SECONDS: 0, + SEARCH_TAB: "TOP", });