diff --git a/test-data/tweets3.jsonl b/test-data/tweets3.jsonl new file mode 100644 index 0000000..d0405a1 --- /dev/null +++ b/test-data/tweets3.jsonl @@ -0,0 +1,2 @@ +{"data": [{"id": "1388203310327508995", "entities": {"annotations": [{"start": 47, "end": 49, "probability": 0.5491, "type": "Organization", "normalized_text": "DSS"}, {"start": 95, "end": 100, "probability": 0.4221, "type": "Place", "normalized_text": "41days"}], "hashtags": [{"start": 107, "end": 115, "tag": "EndSARS"}], "mentions": [{"start": 3, "end": 12, "username": "abjghost", "id": "707540943134711808"}, {"start": 14, "end": 30, "username": "imoleayomichael", "id": "927129038933626880"}]}, "possibly_sensitive": false, "conversation_id": "1388203310327508995", "lang": "en", "text": "RT @abjghost: @imoleayomichael was abducted by DSS at 2.30am in his residence and detained for 41days over #EndSARS protest. They still wan\u2026", "public_metrics": {"retweet_count": 65, "reply_count": 0, "like_count": 0, "quote_count": 0}, "source": "Twitter for iPhone", "reply_settings": "everyone", "referenced_tweets": [{"type": "retweeted", "id": "1388174000472432650"}], "author_id": "364212595", "created_at": "2021-04-30T18:47:15.000Z"}], "includes": {"users": [{"id": "364212595", "public_metrics": {"followers_count": 716, "following_count": 637, "tweet_count": 7430, "listed_count": 0}, "profile_image_url": "https://pbs.twimg.com/profile_images/1463823816249905152/P70DYQO9_normal.jpg", "verified": false, "username": "boo_chiiii", "name": "Dr B", "created_at": "2011-08-29T11:52:16.000Z", "description": "", "url": "", "protected": false}, {"id": "707540943134711808", "public_metrics": {"followers_count": 7331, "following_count": 7700, "tweet_count": 23616, "listed_count": 2}, "profile_image_url": "https://pbs.twimg.com/profile_images/1368867227240390657/DIEHzhPo_normal.jpg", "verified": false, "username": "abjghost", "name": "JagunJagun\ud83c\udf11", "pinned_tweet_id": "1342577248029827079", "created_at": "2016-03-09T12:18:01.000Z", "location": "Abuja, Nigeria", "description": "Abuja Connect|Digital Media Plug|Manchester United|Very Social", "url": "", "protected": false}, {"id": "927129038933626880", "public_metrics": {"followers_count": 9282, "following_count": 7714, "tweet_count": 27128, "listed_count": 9}, "profile_image_url": "https://pbs.twimg.com/profile_images/1342557547534966785/jp4DiwSW_normal.jpg", "verified": false, "username": "imoleayomichael", "name": "Imoleayo Michael \ud83d\udc51 {iCode}\ud83d\udd4a\ufe0f", "pinned_tweet_id": "1384780512791781376", "created_at": "2017-11-05T11:02:42.000Z", "location": "192.168.1.1", "entities": {"url": {"urls": [{"start": 0, "end": 23, "url": "https://t.co/9xD9Bb1Olp", "expanded_url": "http://icoderesources.com.ng", "display_url": "icoderesources.com.ng"}]}, "description": {"hashtags": [{"start": 52, "end": 63, "tag": "Programmer"}, {"start": 64, "end": 70, "tag": "Virgo"}, {"start": 71, "end": 77, "tag": "Coder"}]}}, "description": "CEO, iCode Resources | Tech Guru | Weirdo | Coder | #Programmer #Virgo #Coder", "url": "https://t.co/9xD9Bb1Olp", "protected": false}], "tweets": [{"id": "1388174000472432650", "entities": {"annotations": [{"start": 33, "end": 35, "probability": 0.4805, "type": "Organization", "normalized_text": "DSS"}, {"start": 81, "end": 86, "probability": 0.3244, "type": "Person", "normalized_text": "41days"}, {"start": 143, "end": 151, "probability": 0.4881, "type": "Person", "normalized_text": "Imoleayo"}], "urls": [{"start": 280, "end": 303, "url": "https://t.co/fDgTVvbQBZ", "expanded_url": "https://twitter.com/abjghost/status/1388174000472432650/photo/1", "display_url": "pic.twitter.com/fDgTVvbQBZ"}], "hashtags": [{"start": 93, "end": 101, "tag": "EndSARS"}, {"start": 224, "end": 237, "tag": "FreeImoleAyo"}], "mentions": [{"start": 0, "end": 16, "username": "imoleayomichael", "id": "927129038933626880"}]}, "possibly_sensitive": true, "attachments": {"media_keys": ["3_1388173984898879492"]}, "conversation_id": "1388174000472432650", "lang": "en", "text": "@imoleayomichael was abducted by DSS at 2.30am in his residence and detained for 41days over #EndSARS protest. They still want to convict him.\n\nImoleayo is a Programmer NOT A CRIMINAL!\n\nPls lend your voice in solidarity to \n#FreeImoleAyo\nIt could be you or me.\nPls tweet, RT, Tag https://t.co/fDgTVvbQBZ", "public_metrics": {"retweet_count": 65, "reply_count": 1, "like_count": 49, "quote_count": 0}, "context_annotations": [{"domain": {"id": "65", "name": "Interests and Hobbies Vertical", "description": "Top level interests and hobbies groupings, like Food or Travel"}, "entity": {"id": "848920371311001600", "name": "Technology", "description": "Technology and computing"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "848921413196984320", "name": "Computer programming", "description": "Computer programming"}}], "source": "Twitter for iPhone", "reply_settings": "everyone", "author_id": "707540943134711808", "in_reply_to_user_id": "927129038933626880", "created_at": "2021-04-30T16:50:47.000Z"}]}, "__twarc": {"url": "https://api.twitter.com/2/tweets?expansions=author_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id%2Centities.mentions.username%2Cattachments.poll_ids%2Cattachments.media_keys%2Cgeo.place_id&tweet.fields=attachments%2Cauthor_id%2Ccontext_annotations%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Ctext%2Cpossibly_sensitive%2Creferenced_tweets%2Creply_settings%2Csource%2Cwithheld&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cwithheld&media.fields=alt_text%2Cduration_ms%2Cheight%2Cmedia_key%2Cpreview_image_url%2Ctype%2Curl%2Cwidth%2Cpublic_metrics&poll.fields=duration_minutes%2Cend_datetime%2Cid%2Coptions%2Cvoting_status&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&ids=1388203310327508995", "version": "2.9.1", "retrieved_at": "2022-02-05T15:50:57+00:00"}} +{"data": [{"entities": {"annotations": [{"start": 33, "end": 35, "probability": 0.4805, "type": "Organization", "normalized_text": "DSS"}, {"start": 81, "end": 86, "probability": 0.3244, "type": "Person", "normalized_text": "41days"}, {"start": 143, "end": 151, "probability": 0.4881, "type": "Person", "normalized_text": "Imoleayo"}], "urls": [{"start": 280, "end": 303, "url": "https://t.co/fDgTVvbQBZ", "expanded_url": "https://twitter.com/abjghost/status/1388174000472432650/photo/1", "display_url": "pic.twitter.com/fDgTVvbQBZ"}], "hashtags": [{"start": 93, "end": 101, "tag": "EndSARS"}, {"start": 224, "end": 237, "tag": "FreeImoleAyo"}], "mentions": [{"start": 0, "end": 16, "username": "imoleayomichael", "id": "927129038933626880"}]}, "author_id": "707540943134711808", "conversation_id": "1388174000472432650", "source": "Twitter for iPhone", "in_reply_to_user_id": "927129038933626880", "possibly_sensitive": true, "id": "1388174000472432650", "attachments": {"media_keys": ["3_1388173984898879492"]}, "text": "@imoleayomichael was abducted by DSS at 2.30am in his residence and detained for 41days over #EndSARS protest. They still want to convict him.\n\nImoleayo is a Programmer NOT A CRIMINAL!\n\nPls lend your voice in solidarity to \n#FreeImoleAyo\nIt could be you or me.\nPls tweet, RT, Tag https://t.co/fDgTVvbQBZ", "reply_settings": "everyone", "public_metrics": {"retweet_count": 65, "reply_count": 1, "like_count": 49, "quote_count": 0}, "lang": "en", "context_annotations": [{"domain": {"id": "65", "name": "Interests and Hobbies Vertical", "description": "Top level interests and hobbies groupings, like Food or Travel"}, "entity": {"id": "848920371311001600", "name": "Technology", "description": "Technology and computing"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "848921413196984320", "name": "Computer programming", "description": "Computer programming"}}], "created_at": "2021-04-30T16:50:47.000Z"}], "includes": {"media": [{"media_key": "3_1388173984898879492", "height": 810, "url": "https://pbs.twimg.com/media/E0PI4mhWEAQPAzY.jpg", "type": "photo", "width": 1440}], "users": [{"id": "707540943134711808", "url": "", "name": "JagunJagun\ud83c\udf11", "protected": false, "public_metrics": {"followers_count": 7331, "following_count": 7700, "tweet_count": 23616, "listed_count": 2}, "pinned_tweet_id": "1342577248029827079", "description": "Abuja Connect|Digital Media Plug|Manchester United|Very Social", "created_at": "2016-03-09T12:18:01.000Z", "verified": false, "username": "abjghost", "location": "Abuja, Nigeria", "profile_image_url": "https://pbs.twimg.com/profile_images/1368867227240390657/DIEHzhPo_normal.jpg"}, {"id": "927129038933626880", "url": "https://t.co/9xD9Bb1Olp", "name": "Imoleayo Michael \ud83d\udc51 {iCode}\ud83d\udd4a\ufe0f", "protected": false, "public_metrics": {"followers_count": 9282, "following_count": 7714, "tweet_count": 27128, "listed_count": 9}, "pinned_tweet_id": "1384780512791781376", "entities": {"url": {"urls": [{"start": 0, "end": 23, "url": "https://t.co/9xD9Bb1Olp", "expanded_url": "http://icoderesources.com.ng", "display_url": "icoderesources.com.ng"}]}, "description": {"hashtags": [{"start": 52, "end": 63, "tag": "Programmer"}, {"start": 64, "end": 70, "tag": "Virgo"}, {"start": 71, "end": 77, "tag": "Coder"}]}}, "description": "CEO, iCode Resources | Tech Guru | Weirdo | Coder | #Programmer #Virgo #Coder", "created_at": "2017-11-05T11:02:42.000Z", "verified": false, "username": "imoleayomichael", "location": "192.168.1.1", "profile_image_url": "https://pbs.twimg.com/profile_images/1342557547534966785/jp4DiwSW_normal.jpg"}]}, "__twarc": {"url": "https://api.twitter.com/2/tweets?expansions=author_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id%2Centities.mentions.username%2Cattachments.poll_ids%2Cattachments.media_keys%2Cgeo.place_id&tweet.fields=attachments%2Cauthor_id%2Ccontext_annotations%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Ctext%2Cpossibly_sensitive%2Creferenced_tweets%2Creply_settings%2Csource%2Cwithheld&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cwithheld&media.fields=alt_text%2Cduration_ms%2Cheight%2Cmedia_key%2Cpreview_image_url%2Ctype%2Curl%2Cwidth%2Cpublic_metrics&poll.fields=duration_minutes%2Cend_datetime%2Cid%2Coptions%2Cvoting_status&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&ids=1388174000472432650", "version": "2.9.1", "retrieved_at": "2022-02-05T15:51:22+00:00"}} diff --git a/test_twarc_hashtags.py b/test_twarc_hashtags.py index f6037cd..cd49c3e 100644 --- a/test_twarc_hashtags.py +++ b/test_twarc_hashtags.py @@ -3,41 +3,71 @@ runner = CliRunner() + def test_basic(): - result = runner.invoke(hashtags, ['test-data/tweets1.jsonl']) + result = runner.invoke(hashtags, ["test-data/tweets1.jsonl"]) assert result.exit_code == 0 - assert result.output == \ -'''hashtag,tweets + assert ( + result.output + == """hashtag,tweets +gettyimagesnews,2 naacpimageawards,2 -usa,1 -trafficking,1 -refugees,1 -obama,1 -isis,1 -illegals,1 -cartel,1 +savetheseabirds,2 biden,1 -''' +boulder,1 +cartel,1 +forthepeopleact,1 +guncontrolnow,1 +illegals,1 +isis,1 +islarebelde,1 +nft,1 +niunamenos,1 +nra,1 +obama,1 +patriaomuerte,1 +refugees,1 +sethrich,1 +trafficking,1 +usa,1 +""" + ) + def test_group(): - result = runner.invoke(hashtags, ['--group', 'day', 'test-data/tweets2.jsonl']) + result = runner.invoke(hashtags, ["--group", "day", "test-data/tweets2.jsonl"]) assert result.exit_code == 0 assert result.output.startswith( -'''hashtag,time,tweets + """hashtag,time,tweets +ethereum,2021-08-20,13 bitcoin,2021-08-20,10 -reinstate45,2021-08-20,5 -hypocrisy,2021-08-20,5 fbi,2021-08-20,5 -''') +hypocrisy,2021-08-20,5 +""" + ) + def test_limit(): - result = runner.invoke(hashtags, ['--limit', '5', 'test-data/tweets2.jsonl']) + result = runner.invoke(hashtags, ["--limit", "5", "test-data/tweets2.jsonl"]) assert result.exit_code == 0 - assert result.output == \ -'''hashtag,tweets -wtfhappenedin1971,389 -bitcoin,372 -banliznowjack,230 -btc,113 -farmersprotest,111 -''' + assert ( + result.output + == """hashtag,tweets +banliznowjack,650 +wtfhappenedin1971,599 +bitcoin,469 +farmersprotest,128 +btc,121 +""" + ) + + +def test_retweets(): + result = runner.invoke(hashtags, ["test-data/tweets3.jsonl"]) + assert result.exit_code == 0 + assert result.output.startswith( + """hashtag,tweets +endsars,2 +freeimoleayo,2 +""" + ) diff --git a/twarc_hashtags.py b/twarc_hashtags.py index 494a72e..9e7d9a7 100644 --- a/twarc_hashtags.py +++ b/twarc_hashtags.py @@ -6,32 +6,29 @@ from twarc.expansions import ensure_flattened from twarc.decorators2 import FileSizeProgressBar + @click.command() @click.option( - "--group", + "--group", "-g", type=click.Choice(["day", "week", "month", "year"]), - help="Group hashtag results by time" + help="Group hashtag results by time", ) @click.option( "--db", "-d", "db_path", default="hashtags.db", - help="Path to use for the SQLite database" + help="Path to use for the SQLite database", ) @click.option( "--skip-import", "-s", is_flag=True, - help="Skip loading the tweets and use existing SQLite database" + help="Skip loading the tweets and use existing SQLite database", ) @click.option( - "--limit", - "-l", - type=int, - default=0, - help="Limit output to this many hashtags" + "--limit", "-l", type=int, default=0, help="Limit output to this many hashtags" ) @click.argument("infile", type=click.File("r"), default="-") @click.argument("outfile", type=click.File("w"), default="-") @@ -51,7 +48,7 @@ def hashtags(group, db_path, limit, skip_import, infile, outfile): def load(infile, outfile, db): - with FileSizeProgressBar(infile, outfile) as progress: + with FileSizeProgressBar(infile, outfile) as progress: for line in infile: progress.update(len(line)) @@ -62,6 +59,33 @@ def load(infile, outfile, db): data = json.loads(line) for tweet in ensure_flattened(data): + # Process Retweets: + if "referenced_tweets" in tweet: + rts = [ + t + for t in tweet["referenced_tweets"] + if t["type"] == "retweeted" + ] + retweeted_tweet = rts[-1] if rts else None + # If it's a native retweet, replace the "RT @user Text" with the original text, metrics, and entities, but keep the Author. + if retweeted_tweet: + # A retweet inherits everything from retweeted tweet. + tweet["text"] = retweeted_tweet.pop( + "text", tweet.pop("text", None) + ) + tweet["entities"] = retweeted_tweet.pop( + "entities", tweet.pop("entities", None) + ) + tweet["attachments"] = retweeted_tweet.pop( + "attachments", tweet.pop("attachments", None) + ) + tweet["context_annotations"] = retweeted_tweet.pop( + "context_annotations", + tweet.pop("context_annotations", None), + ) + tweet["public_metrics"] = retweeted_tweet.pop( + "public_metrics", tweet.pop("public_metrics", None) + ) if "entities" in tweet and "hashtags" in tweet["entities"]: for hashtag in tweet["entities"]["hashtags"]: db.execute( @@ -69,13 +93,9 @@ def load(infile, outfile, db): INSERT INTO hashtags (id, created, hashtag) VALUES (?, ?, ?) """, - ( - tweet["id"], - tweet["created_at"], - hashtag["tag"].lower() - ) + (tweet["id"], tweet["created_at"], hashtag["tag"].lower()), ) - + db.commit() @@ -97,8 +117,7 @@ def export(outfile, db, group, limit): elif group == "year": fmt = "%Y" - sql = \ - """ + sql = """ SELECT hashtag, STRFTIME(?, created) AS time, @@ -112,8 +131,7 @@ def export(outfile, db, group, limit): # otherwise we're doing a global count else: - sql = \ - """ + sql = """ SELECT hashtag, COUNT(*) AS tweets FROM HASHTAGS GROUP BY hashtag