diff --git a/CHANGELOG.md b/CHANGELOG.md
index ec5113f..c205766 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,22 @@
 # Changelog
 
-## 2.3.0 (2023-10-25)
+## 2.4.2 (2023-11-26)
+
+### Changes
+
+- Added image_url to the output CSV file (if present).
+- Added location to the output CSV file (if present).
+
+## 2.4.1 (2023-11-26)
+
+### Changes
+
+- Fixed inconsistent delimiter and CSV formatting in the crawl functionality.
+- Standardized the delimiter to commas throughout the CSV file.
+- Ensured object values are properly converted to strings in the crawl functionality.
+- Improved CSV formatting and the reliability of data extraction from Twitter data.
+
+## 2.4.0 (2023-10-25)
 
 ### Added
 
diff --git a/package.json b/package.json
index ac9f5a6..63d0416 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "tweet-harvest",
   "description": "A Twitter crawler helper with auth",
-  "version": "2.4.0",
+  "version": "2.4.2",
   "license": "MIT",
   "author": "Helmi Satria",
   "publishConfig": {
@@ -10,7 +10,7 @@
   },
   "repository": {
     "type": "git",
-    "url": "https://github.com/helmisatria/tweet-harvest.git"
+    "url": "git+https://github.com/helmisatria/tweet-harvest.git"
   },
   "files": [
     "dist/",
@@ -39,7 +39,7 @@
     "typescript": "*"
   },
   "bin": {
-    "tweet-harvest": "./dist/bin.js"
+    "tweet-harvest": "dist/bin.js"
   },
   "scripts": {
     "start": "ts-node src/crawl.ts",
diff --git a/src/crawl.ts b/src/crawl.ts
index 7aaf788..6a408eb 100644
--- a/src/crawl.ts
+++ b/src/crawl.ts
@@ -40,12 +40,26 @@ const filteredFields = [
   "conversation_id_str",
   "username",
   "tweet_url",
+  "image_url",
+  "location",
 ];
 
 type StartCrawlTwitterParams = {
   twitterSearchUrl?: string;
 };
 
+function convertValuesToStrings(obj) {
+  const result = {};
+  for (const key in obj) {
+    if (typeof obj[key] === "object" && obj[key] !== null) {
+      result[key] = convertValuesToStrings(obj[key]); // Recursively convert nested object values
+    } else {
+      result[key] = `"${String(obj[key])}"`;
+    }
+  }
+  return result;
+}
+
 export async function crawl({
   ACCESS_TOKEN,
   SEARCH_KEYWORDS,
@@ -225,7 +239,7 @@ export async function crawl({
       }
     }
 
-    const headerRow = filteredFields.join(";") + "\n";
+    const headerRow = filteredFields.map((field) => `"${field}"`).join(",") + "\n";
 
     if (!headerWritten) {
       headerWritten = true;
@@ -279,7 +293,7 @@ export async function crawl({
       const rows = comingTweets.reduce((prev: [], current: (typeof tweetContents)[0]) => {
         const tweet = pick(current.tweet, filteredFields);
 
-        let cleanTweetText = `"${tweet.full_text.replace(/;/g, " ").replace(/\n/g, " ")}"`;
+        let cleanTweetText = `${tweet.full_text.replace(/,/g, " ").replace(/\n/g, " ")}`;
 
         if (IS_DETAIL_MODE) {
           const firstWord = cleanTweetText.split(" ")[0];
@@ -294,8 +308,10 @@ export async function crawl({
         tweet["full_text"] = cleanTweetText;
         tweet["username"] = current.user.screen_name;
         tweet["tweet_url"] = `https://twitter.com/${current.user.screen_name}/status/${tweet.id_str}`;
+        tweet["image_url"] = current.tweet.entities?.media?.[0]?.media_url_https || "";
+        tweet["location"] = current.user.location || "";
 
-        const row = Object.values(tweet).join(";");
+        const row = Object.values(convertValuesToStrings(tweet)).join(",");
 
         return [...prev, row];
       }, []);
diff --git a/src/types/tweets.types.ts b/src/types/tweets.types.ts
index 806956c..c65ee44 100644
--- a/src/types/tweets.types.ts
+++ b/src/types/tweets.types.ts
@@ -29,6 +29,7 @@ interface TweetContent {
   conversation_id_str: string;
   display_text_range: number[];
   entities: {
+    media: any[];
     user_mentions: any[];
     urls: any[];
     hashtags: any[];
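
For context, the quoting logic this diff introduces wraps every scalar value in double quotes via `convertValuesToStrings` and then joins the values with commas. Below is a minimal standalone sketch of that same idea; the sample `tweet` object and its field values are illustrative only and are not taken from the repository.

```ts
// Sketch of the quoting approach added in src/crawl.ts (assumed behavior, not the actual module).
// Each scalar value is wrapped in double quotes; nested objects are converted recursively.
function convertValuesToStrings(obj: Record<string, unknown>): Record<string, unknown> {
  const result: Record<string, unknown> = {};
  for (const key in obj) {
    const value = obj[key];
    if (typeof value === "object" && value !== null) {
      result[key] = convertValuesToStrings(value as Record<string, unknown>);
    } else {
      result[key] = `"${String(value)}"`;
    }
  }
  return result;
}

// Example row assembly, mirroring how crawl() builds each CSV line after this change.
const tweet = { full_text: "hello world", username: "example_user", location: "" };
const row = Object.values(convertValuesToStrings(tweet)).join(",");
console.log(row); // -> "hello world","example_user",""
```

Note that, as in the diff, embedded double quotes inside values are not escaped by this function; commas and newlines in `full_text` are handled upstream by replacing them with spaces before the row is assembled.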