Skip to content

Commit

Permalink
Merge pull request #17 from helmisatria/fix/delimiter
Browse files Browse the repository at this point in the history
fix/delimiter
  • Loading branch information
helmisatria authored Nov 26, 2023
2 parents 69e3643 + 882315a commit 74d54cf
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 7 deletions.
18 changes: 17 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,22 @@
# Changelog

## 2.3.0 (2023-10-25)
## 2.4.2 (2023-11-26)

### Changes

- Add image_url to the output CSV file (if exists).
- Add location to the output CSV file (if exists).

## 2.4.1 (2023-11-26)

### Changes

- Fixed inconsistent delimiter and CSV formatting in crawl functionality.
- The delimiter has been standardized to use commas consistently throughout the CSV file.
- Ensured proper conversion of object values to strings in the crawl functionality.
- Improved CSV formatting and enhanced reliability of data extraction from Twitter data.

## 2.4.0 (2023-10-25)

### Added

Expand Down
6 changes: 3 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "tweet-harvest",
"description": "A Twitter crawler helper with auth",
"version": "2.4.0",
"version": "2.4.2",
"license": "MIT",
"author": "Helmi Satria",
"publishConfig": {
Expand All @@ -10,7 +10,7 @@
},
"repository": {
"type": "git",
"url": "https://github.com/helmisatria/tweet-harvest.git"
"url": "git+https://github.com/helmisatria/tweet-harvest.git"
},
"files": [
"dist/",
Expand Down Expand Up @@ -39,7 +39,7 @@
"typescript": "*"
},
"bin": {
"tweet-harvest": "./dist/bin.js"
"tweet-harvest": "dist/bin.js"
},
"scripts": {
"start": "ts-node src/crawl.ts",
Expand Down
22 changes: 19 additions & 3 deletions src/crawl.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,26 @@ const filteredFields = [
"conversation_id_str",
"username",
"tweet_url",
"image_url",
"location",
];

/**
 * Parameter bag with a single optional string field, `twitterSearchUrl`.
 * NOTE(review): presumably a search URL that overrides a default one —
 * confirm against the function that consumes this type.
 */
type StartCrawlTwitterParams = {
twitterSearchUrl?: string;
};

/**
 * Converts every scalar value of `obj` into a double-quoted CSV field.
 *
 * Each non-object value is stringified with `String(...)`, any embedded
 * double quote is escaped by doubling it (RFC 4180), and the result is
 * wrapped in double quotes. Without the escaping, a value such as
 * `say "hi"` would terminate the quoted field early and corrupt the row.
 *
 * Nested non-null objects (including arrays) are converted recursively and
 * are returned as objects, not strings; a caller that joins the values into
 * a row must flatten them itself.
 *
 * @param obj - Object whose enumerable properties are converted.
 * @returns A new object with the same keys; scalar values become quoted,
 *          escaped strings and nested objects become converted objects.
 */
function convertValuesToStrings(obj: object): Record<string, unknown> {
  const source = obj as Record<string, unknown>;
  const result: Record<string, unknown> = {};

  for (const key in source) {
    const value = source[key];

    if (typeof value === "object" && value !== null) {
      // Recursively convert nested object values.
      result[key] = convertValuesToStrings(value);
    } else {
      // Double any embedded quote so the quoted field stays well-formed.
      const escaped = String(value).replace(/"/g, '""');
      result[key] = `"${escaped}"`;
    }
  }

  return result;
}

export async function crawl({
ACCESS_TOKEN,
SEARCH_KEYWORDS,
Expand Down Expand Up @@ -225,7 +239,7 @@ export async function crawl({
}
}

const headerRow = filteredFields.join(";") + "\n";
const headerRow = filteredFields.map((field) => `"${field}"`).join(",") + "\n";

if (!headerWritten) {
headerWritten = true;
Expand Down Expand Up @@ -279,7 +293,7 @@ export async function crawl({
const rows = comingTweets.reduce((prev: [], current: (typeof tweetContents)[0]) => {
const tweet = pick(current.tweet, filteredFields);

let cleanTweetText = `"${tweet.full_text.replace(/;/g, " ").replace(/\n/g, " ")}"`;
let cleanTweetText = `${tweet.full_text.replace(/,/g, " ").replace(/\n/g, " ")}`;

if (IS_DETAIL_MODE) {
const firstWord = cleanTweetText.split(" ")[0];
Expand All @@ -294,8 +308,10 @@ export async function crawl({
tweet["full_text"] = cleanTweetText;
tweet["username"] = current.user.screen_name;
tweet["tweet_url"] = `https://twitter.com/${current.user.screen_name}/status/${tweet.id_str}`;
tweet["image_url"] = current.tweet.entities?.media?.[0]?.media_url_https || "";
tweet["location"] = current.user.location || "";

const row = Object.values(tweet).join(";");
const row = Object.values(convertValuesToStrings(tweet)).join(",");

return [...prev, row];
}, []);
Expand Down
1 change: 1 addition & 0 deletions src/types/tweets.types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ interface TweetContent {
conversation_id_str: string;
display_text_range: number[];
entities: {
media: any[];
user_mentions: any[];
urls: any[];
hashtags: any[];
Expand Down

0 comments on commit 74d54cf

Please sign in to comment.