Skip to content

Commit

Permalink
Merge pull request #16 from helmisatria/feat/get-data-from-top-search…
Browse files Browse the repository at this point in the history
…-tab

feat: able to get data from tab/latest search tab
  • Loading branch information
helmisatria authored Oct 25, 2023
2 parents a44809f + 87ce06b commit 69e3643
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 7 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@

### Added

- Add the `SEARCH_TAB` option (CLI: `--search-tab` or `--tab`) to choose which search tab tweets are collected from. Valid values are `LATEST` and `TOP`; the default is `LATEST`.

## 2.3.0 (2023-10-25)

### Added

- Implemented optional exponential backoff for rate limit handling. The wait time between retries will now be calculated dynamically based on the number of attempts made, resulting in fewer requests during the rate-limit window. This should help to reduce the risk of account bans. To utilize this feature, set the `ENABLE_EXPONENTIAL_BACKOFF` environment variable to true.

### Changed
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "tweet-harvest",
"description": "A Twitter crawler helper with auth",
"version": "2.3.0",
"version": "2.4.0",
"license": "MIT",
"author": "Helmi Satria",
"publishConfig": {
Expand Down
7 changes: 7 additions & 0 deletions src/bin.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ async function run() {
describe: "Output filename",
type: "string",
},
search_tab: {
alias: "tab",
describe: "Search tab (TOP or LATEST)",
default: "LATEST",
choices: ["TOP", "LATEST"],
},
})
.help()
.alias("help", "h").argv;
Expand Down Expand Up @@ -145,6 +151,7 @@ async function run() {
TARGET_TWEET_COUNT: argv.limit,
DELAY_EACH_TWEET_SECONDS: argv.delay_each_tweet,
OUTPUT_FILENAME: argv.output_filename,
SEARCH_TAB: String(argv.search_tab).toUpperCase() as "TOP" | "LATEST",
});
} catch (err) {
console.error("Error running script:", err);
Expand Down
4 changes: 4 additions & 0 deletions src/constants.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/**
 * Advanced-search entry URLs, keyed by the search tab the crawler should use.
 *
 * - `TOP`: the plain advanced-search page.
 * - `LATEST`: the same page with `f=live`, the query the crawler used for the
 *   "Latest" tab before this map existed.
 *
 * The two entries must differ: the crawler falls back from the selected tab to
 * the other one when no tweets are found, and identical URLs would make that
 * fallback (and the tab option itself) a no-op.
 */
export const TWITTER_SEARCH_ADVANCED_URL = {
  TOP: "https://twitter.com/search-advanced",
  LATEST: "https://twitter.com/search-advanced?f=live",
};
17 changes: 11 additions & 6 deletions src/crawl.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import { inputKeywords } from "./features/input-keywords";
import { listenNetworkRequests } from "./features/listen-network-requests";
import { calculateForRateLimit } from "./features/exponential-backoff";
import { HEADLESS_MODE } from "./env";
import { TWITTER_SEARCH_ADVANCED_URL } from "./constants";

chromium.use(stealth());

Expand Down Expand Up @@ -57,6 +58,7 @@ export async function crawl({
DELAY_EVERY_100_TWEETS_SECONDS = 10,
DEBUG_MODE,
OUTPUT_FILENAME,
SEARCH_TAB = "LATEST",
}: {
ACCESS_TOKEN: string;
SEARCH_KEYWORDS?: string;
Expand All @@ -68,8 +70,11 @@ export async function crawl({
DEBUG_MODE?: boolean;
OUTPUT_FILENAME?: string;
TWEET_THREAD_URL?: string;
SEARCH_TAB?: "LATEST" | "TOP";
}) {
const CRAWL_MODE = TWEET_THREAD_URL ? "DETAIL" : "SEARCH";
const SWITCHED_SEARCH_TAB = SEARCH_TAB === "TOP" ? "LATEST" : "TOP";

const IS_DETAIL_MODE = CRAWL_MODE === "DETAIL";
const IS_SEARCH_MODE = CRAWL_MODE === "SEARCH";
const TIMEOUT_LIMIT = 4;
Expand All @@ -94,7 +99,7 @@ export async function crawl({
fs.renameSync(FILE_NAME, FILE_NAME.replace(".csv", ".old.csv"));
}

let TWEETS_NOT_FOUND_ON_LIVE_TAB = false;
let TWEETS_NOT_FOUND_ON_CURRENT_TAB = false;

const browser = await chromium.launch({ headless: HEADLESS_MODE });

Expand Down Expand Up @@ -123,7 +128,7 @@ export async function crawl({
listenNetworkRequests(page);

async function startCrawlTwitter({
twitterSearchUrl = "https://twitter.com/search-advanced?f=live",
twitterSearchUrl = TWITTER_SEARCH_ADVANCED_URL[SEARCH_TAB],
}: StartCrawlTwitterParams = {}) {
if (IS_DETAIL_MODE) {
await page.goto(TWEET_THREAD_URL);
Expand Down Expand Up @@ -214,7 +219,7 @@ export async function crawl({
if (!tweets.length) {
// found text "not found" on the page
if (await page.getByText("No results for").count()) {
TWEETS_NOT_FOUND_ON_LIVE_TAB = true;
TWEETS_NOT_FOUND_ON_CURRENT_TAB = true;
console.info("No tweets found for the search criteria");
break;
}
Expand Down Expand Up @@ -354,11 +359,11 @@ export async function crawl({
try {
await startCrawlTwitter();

if (TWEETS_NOT_FOUND_ON_LIVE_TAB && (SEARCH_FROM_DATE || SEARCH_TO_DATE)) {
console.info('No tweets found on "Latest" tab, trying "Top" tab...');
if (TWEETS_NOT_FOUND_ON_CURRENT_TAB && (SEARCH_FROM_DATE || SEARCH_TO_DATE)) {
console.info(`No tweets found on "${SEARCH_TAB}" tab, trying "${SWITCHED_SEARCH_TAB}" tab...`);

await startCrawlTwitter({
twitterSearchUrl: "https://twitter.com/search-advanced",
twitterSearchUrl: TWITTER_SEARCH_ADVANCED_URL[SWITCHED_SEARCH_TAB],
});
}
} catch (error) {
Expand Down
1 change: 1 addition & 0 deletions src/dev.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ crawl({
OUTPUT_FILENAME: "gibran.csv",
DELAY_EACH_TWEET_SECONDS: 0.1,
DELAY_EVERY_100_TWEETS_SECONDS: 0,
SEARCH_TAB: "TOP",
});

0 comments on commit 69e3643

Please sign in to comment.