-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Change counting hashtags in retweets #2
base: main
Are you sure you want to change the base?
Changes from all commits
8f39b74
4575a69
e3adcd9
91804da
4687b26
666ae4b
c04c8fc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"data": [{"id": "1388203310327508995", "entities": {"annotations": [{"start": 47, "end": 49, "probability": 0.5491, "type": "Organization", "normalized_text": "DSS"}, {"start": 95, "end": 100, "probability": 0.4221, "type": "Place", "normalized_text": "41days"}], "hashtags": [{"start": 107, "end": 115, "tag": "EndSARS"}], "mentions": [{"start": 3, "end": 12, "username": "abjghost", "id": "707540943134711808"}, {"start": 14, "end": 30, "username": "imoleayomichael", "id": "927129038933626880"}]}, "possibly_sensitive": false, "conversation_id": "1388203310327508995", "lang": "en", "text": "RT @abjghost: @imoleayomichael was abducted by DSS at 2.30am in his residence and detained for 41days over #EndSARS protest. They still wan\u2026", "public_metrics": {"retweet_count": 65, "reply_count": 0, "like_count": 0, "quote_count": 0}, "source": "Twitter for iPhone", "reply_settings": "everyone", "referenced_tweets": [{"type": "retweeted", "id": "1388174000472432650"}], "author_id": "364212595", "created_at": "2021-04-30T18:47:15.000Z"}], "includes": {"users": [{"id": "364212595", "public_metrics": {"followers_count": 716, "following_count": 637, "tweet_count": 7430, "listed_count": 0}, "profile_image_url": "https://pbs.twimg.com/profile_images/1463823816249905152/P70DYQO9_normal.jpg", "verified": false, "username": "boo_chiiii", "name": "Dr B", "created_at": "2011-08-29T11:52:16.000Z", "description": "", "url": "", "protected": false}, {"id": "707540943134711808", "public_metrics": {"followers_count": 7331, "following_count": 7700, "tweet_count": 23616, "listed_count": 2}, "profile_image_url": "https://pbs.twimg.com/profile_images/1368867227240390657/DIEHzhPo_normal.jpg", "verified": false, "username": "abjghost", "name": "JagunJagun\ud83c\udf11", "pinned_tweet_id": "1342577248029827079", "created_at": "2016-03-09T12:18:01.000Z", "location": "Abuja, Nigeria", "description": "Abuja Connect|Digital Media Plug|Manchester United|Very Social", "url": "", "protected": false}, 
{"id": "927129038933626880", "public_metrics": {"followers_count": 9282, "following_count": 7714, "tweet_count": 27128, "listed_count": 9}, "profile_image_url": "https://pbs.twimg.com/profile_images/1342557547534966785/jp4DiwSW_normal.jpg", "verified": false, "username": "imoleayomichael", "name": "Imoleayo Michael \ud83d\udc51 {iCode}\ud83d\udd4a\ufe0f", "pinned_tweet_id": "1384780512791781376", "created_at": "2017-11-05T11:02:42.000Z", "location": "192.168.1.1", "entities": {"url": {"urls": [{"start": 0, "end": 23, "url": "https://t.co/9xD9Bb1Olp", "expanded_url": "http://icoderesources.com.ng", "display_url": "icoderesources.com.ng"}]}, "description": {"hashtags": [{"start": 52, "end": 63, "tag": "Programmer"}, {"start": 64, "end": 70, "tag": "Virgo"}, {"start": 71, "end": 77, "tag": "Coder"}]}}, "description": "CEO, iCode Resources | Tech Guru | Weirdo | Coder | #Programmer #Virgo #Coder", "url": "https://t.co/9xD9Bb1Olp", "protected": false}], "tweets": [{"id": "1388174000472432650", "entities": {"annotations": [{"start": 33, "end": 35, "probability": 0.4805, "type": "Organization", "normalized_text": "DSS"}, {"start": 81, "end": 86, "probability": 0.3244, "type": "Person", "normalized_text": "41days"}, {"start": 143, "end": 151, "probability": 0.4881, "type": "Person", "normalized_text": "Imoleayo"}], "urls": [{"start": 280, "end": 303, "url": "https://t.co/fDgTVvbQBZ", "expanded_url": "https://twitter.com/abjghost/status/1388174000472432650/photo/1", "display_url": "pic.twitter.com/fDgTVvbQBZ"}], "hashtags": [{"start": 93, "end": 101, "tag": "EndSARS"}, {"start": 224, "end": 237, "tag": "FreeImoleAyo"}], "mentions": [{"start": 0, "end": 16, "username": "imoleayomichael", "id": "927129038933626880"}]}, "possibly_sensitive": true, "attachments": {"media_keys": ["3_1388173984898879492"]}, "conversation_id": "1388174000472432650", "lang": "en", "text": "@imoleayomichael was abducted by DSS at 2.30am in his residence and detained for 41days over #EndSARS protest. 
They still want to convict him.\n\nImoleayo is a Programmer NOT A CRIMINAL!\n\nPls lend your voice in solidarity to \n#FreeImoleAyo\nIt could be you or me.\nPls tweet, RT, Tag https://t.co/fDgTVvbQBZ", "public_metrics": {"retweet_count": 65, "reply_count": 1, "like_count": 49, "quote_count": 0}, "context_annotations": [{"domain": {"id": "65", "name": "Interests and Hobbies Vertical", "description": "Top level interests and hobbies groupings, like Food or Travel"}, "entity": {"id": "848920371311001600", "name": "Technology", "description": "Technology and computing"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "848921413196984320", "name": "Computer programming", "description": "Computer programming"}}], "source": "Twitter for iPhone", "reply_settings": "everyone", "author_id": "707540943134711808", "in_reply_to_user_id": "927129038933626880", "created_at": "2021-04-30T16:50:47.000Z"}]}, "__twarc": {"url": 
"https://api.twitter.com/2/tweets?expansions=author_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id%2Centities.mentions.username%2Cattachments.poll_ids%2Cattachments.media_keys%2Cgeo.place_id&tweet.fields=attachments%2Cauthor_id%2Ccontext_annotations%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Ctext%2Cpossibly_sensitive%2Creferenced_tweets%2Creply_settings%2Csource%2Cwithheld&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cwithheld&media.fields=alt_text%2Cduration_ms%2Cheight%2Cmedia_key%2Cpreview_image_url%2Ctype%2Curl%2Cwidth%2Cpublic_metrics&poll.fields=duration_minutes%2Cend_datetime%2Cid%2Coptions%2Cvoting_status&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&ids=1388203310327508995", "version": "2.9.1", "retrieved_at": "2022-02-05T15:50:57+00:00"}} | ||
{"data": [{"entities": {"annotations": [{"start": 33, "end": 35, "probability": 0.4805, "type": "Organization", "normalized_text": "DSS"}, {"start": 81, "end": 86, "probability": 0.3244, "type": "Person", "normalized_text": "41days"}, {"start": 143, "end": 151, "probability": 0.4881, "type": "Person", "normalized_text": "Imoleayo"}], "urls": [{"start": 280, "end": 303, "url": "https://t.co/fDgTVvbQBZ", "expanded_url": "https://twitter.com/abjghost/status/1388174000472432650/photo/1", "display_url": "pic.twitter.com/fDgTVvbQBZ"}], "hashtags": [{"start": 93, "end": 101, "tag": "EndSARS"}, {"start": 224, "end": 237, "tag": "FreeImoleAyo"}], "mentions": [{"start": 0, "end": 16, "username": "imoleayomichael", "id": "927129038933626880"}]}, "author_id": "707540943134711808", "conversation_id": "1388174000472432650", "source": "Twitter for iPhone", "in_reply_to_user_id": "927129038933626880", "possibly_sensitive": true, "id": "1388174000472432650", "attachments": {"media_keys": ["3_1388173984898879492"]}, "text": "@imoleayomichael was abducted by DSS at 2.30am in his residence and detained for 41days over #EndSARS protest. 
They still want to convict him.\n\nImoleayo is a Programmer NOT A CRIMINAL!\n\nPls lend your voice in solidarity to \n#FreeImoleAyo\nIt could be you or me.\nPls tweet, RT, Tag https://t.co/fDgTVvbQBZ", "reply_settings": "everyone", "public_metrics": {"retweet_count": 65, "reply_count": 1, "like_count": 49, "quote_count": 0}, "lang": "en", "context_annotations": [{"domain": {"id": "65", "name": "Interests and Hobbies Vertical", "description": "Top level interests and hobbies groupings, like Food or Travel"}, "entity": {"id": "848920371311001600", "name": "Technology", "description": "Technology and computing"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "848921413196984320", "name": "Computer programming", "description": "Computer programming"}}], "created_at": "2021-04-30T16:50:47.000Z"}], "includes": {"media": [{"media_key": "3_1388173984898879492", "height": 810, "url": "https://pbs.twimg.com/media/E0PI4mhWEAQPAzY.jpg", "type": "photo", "width": 1440}], "users": [{"id": "707540943134711808", "url": "", "name": "JagunJagun\ud83c\udf11", "protected": false, "public_metrics": {"followers_count": 7331, "following_count": 7700, "tweet_count": 23616, "listed_count": 2}, "pinned_tweet_id": "1342577248029827079", "description": "Abuja Connect|Digital Media Plug|Manchester United|Very Social", "created_at": "2016-03-09T12:18:01.000Z", "verified": false, "username": "abjghost", "location": "Abuja, Nigeria", "profile_image_url": "https://pbs.twimg.com/profile_images/1368867227240390657/DIEHzhPo_normal.jpg"}, {"id": "927129038933626880", "url": "https://t.co/9xD9Bb1Olp", "name": "Imoleayo Michael \ud83d\udc51 {iCode}\ud83d\udd4a\ufe0f", "protected": false, "public_metrics": {"followers_count": 9282, "following_count": 7714, "tweet_count": 27128, "listed_count": 9}, "pinned_tweet_id": "1384780512791781376", "entities": {"url": {"urls": 
[{"start": 0, "end": 23, "url": "https://t.co/9xD9Bb1Olp", "expanded_url": "http://icoderesources.com.ng", "display_url": "icoderesources.com.ng"}]}, "description": {"hashtags": [{"start": 52, "end": 63, "tag": "Programmer"}, {"start": 64, "end": 70, "tag": "Virgo"}, {"start": 71, "end": 77, "tag": "Coder"}]}}, "description": "CEO, iCode Resources | Tech Guru | Weirdo | Coder | #Programmer #Virgo #Coder", "created_at": "2017-11-05T11:02:42.000Z", "verified": false, "username": "imoleayomichael", "location": "192.168.1.1", "profile_image_url": "https://pbs.twimg.com/profile_images/1342557547534966785/jp4DiwSW_normal.jpg"}]}, "__twarc": {"url": "https://api.twitter.com/2/tweets?expansions=author_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id%2Centities.mentions.username%2Cattachments.poll_ids%2Cattachments.media_keys%2Cgeo.place_id&tweet.fields=attachments%2Cauthor_id%2Ccontext_annotations%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Ctext%2Cpossibly_sensitive%2Creferenced_tweets%2Creply_settings%2Csource%2Cwithheld&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cwithheld&media.fields=alt_text%2Cduration_ms%2Cheight%2Cmedia_key%2Cpreview_image_url%2Ctype%2Curl%2Cwidth%2Cpublic_metrics&poll.fields=duration_minutes%2Cend_datetime%2Cid%2Coptions%2Cvoting_status&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&ids=1388174000472432650", "version": "2.9.1", "retrieved_at": "2022-02-05T15:51:22+00:00"}} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,32 +6,29 @@ | |
from twarc.expansions import ensure_flattened | ||
from twarc.decorators2 import FileSizeProgressBar | ||
|
||
|
||
@click.command() | ||
@click.option( | ||
"--group", | ||
"--group", | ||
"-g", | ||
type=click.Choice(["day", "week", "month", "year"]), | ||
help="Group hashtag results by time" | ||
help="Group hashtag results by time", | ||
) | ||
@click.option( | ||
"--db", | ||
"-d", | ||
"db_path", | ||
default="hashtags.db", | ||
help="Path to use for the SQLite database" | ||
help="Path to use for the SQLite database", | ||
) | ||
@click.option( | ||
"--skip-import", | ||
"-s", | ||
is_flag=True, | ||
help="Skip loading the tweets and use existing SQLite database" | ||
help="Skip loading the tweets and use existing SQLite database", | ||
) | ||
@click.option( | ||
"--limit", | ||
"-l", | ||
type=int, | ||
default=0, | ||
help="Limit output to this many hashtags" | ||
"--limit", "-l", type=int, default=0, help="Limit output to this many hashtags" | ||
) | ||
@click.argument("infile", type=click.File("r"), default="-") | ||
@click.argument("outfile", type=click.File("w"), default="-") | ||
|
@@ -51,7 +48,7 @@ def hashtags(group, db_path, limit, skip_import, infile, outfile): | |
|
||
def load(infile, outfile, db): | ||
|
||
with FileSizeProgressBar(infile, outfile) as progress: | ||
with FileSizeProgressBar(infile, outfile) as progress: | ||
for line in infile: | ||
progress.update(len(line)) | ||
|
||
|
@@ -62,20 +59,43 @@ def load(infile, outfile, db): | |
|
||
data = json.loads(line) | ||
for tweet in ensure_flattened(data): | ||
# Process Retweets: | ||
if "referenced_tweets" in tweet: | ||
rts = [ | ||
t | ||
for t in tweet["referenced_tweets"] | ||
if t["type"] == "retweeted" | ||
] | ||
retweeted_tweet = rts[-1] if rts else None | ||
# If it's a native retweet, replace the "RT @user Text" with the original text, metrics, and entities, but keep the Author. | ||
if retweeted_tweet: | ||
# A retweet inherits everything from retweeted tweet. | ||
tweet["text"] = retweeted_tweet.pop( | ||
"text", tweet.pop("text", None) | ||
) | ||
tweet["entities"] = retweeted_tweet.pop( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since we are only interested in hashtags, isn't `entities` all that is needed? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, but I'm inclined to leave them in because it's more explicit about what's happening, and so that any future additions or modifications won't throw up any surprises. I don't think it hurts to have the extra bits there, but I'm also OK with just commenting them out. |
||
"entities", tweet.pop("entities", None) | ||
) | ||
tweet["attachments"] = retweeted_tweet.pop( | ||
"attachments", tweet.pop("attachments", None) | ||
) | ||
tweet["context_annotations"] = retweeted_tweet.pop( | ||
"context_annotations", | ||
tweet.pop("context_annotations", None), | ||
) | ||
tweet["public_metrics"] = retweeted_tweet.pop( | ||
"public_metrics", tweet.pop("public_metrics", None) | ||
) | ||
if "entities" in tweet and "hashtags" in tweet["entities"]: | ||
for hashtag in tweet["entities"]["hashtags"]: | ||
db.execute( | ||
""" | ||
INSERT INTO hashtags (id, created, hashtag) | ||
VALUES (?, ?, ?) | ||
""", | ||
( | ||
tweet["id"], | ||
tweet["created_at"], | ||
hashtag["tag"].lower() | ||
) | ||
(tweet["id"], tweet["created_at"], hashtag["tag"].lower()), | ||
) | ||
|
||
db.commit() | ||
|
||
|
||
|
@@ -97,8 +117,7 @@ def export(outfile, db, group, limit): | |
elif group == "year": | ||
fmt = "%Y" | ||
|
||
sql = \ | ||
""" | ||
sql = """ | ||
SELECT | ||
hashtag, | ||
STRFTIME(?, created) AS time, | ||
|
@@ -112,8 +131,7 @@ def export(outfile, db, group, limit): | |
|
||
# otherwise we're doing a global count | ||
else: | ||
sql = \ | ||
""" | ||
sql = """ | ||
SELECT hashtag, COUNT(*) AS tweets | ||
FROM HASHTAGS | ||
GROUP BY hashtag | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe we should consider hashtags in any referenced tweet?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought about that, but I was unsure how to handle it. Maybe it should have one switch for all:
`--include-referenced-tweets` or `--count-referenced-tweets`,
or add one for each type:
`--count-replies`, `--count-quotes`, etc.?