Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change counting hashtags in retweets #2

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions test-data/tweets3.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"data": [{"id": "1388203310327508995", "entities": {"annotations": [{"start": 47, "end": 49, "probability": 0.5491, "type": "Organization", "normalized_text": "DSS"}, {"start": 95, "end": 100, "probability": 0.4221, "type": "Place", "normalized_text": "41days"}], "hashtags": [{"start": 107, "end": 115, "tag": "EndSARS"}], "mentions": [{"start": 3, "end": 12, "username": "abjghost", "id": "707540943134711808"}, {"start": 14, "end": 30, "username": "imoleayomichael", "id": "927129038933626880"}]}, "possibly_sensitive": false, "conversation_id": "1388203310327508995", "lang": "en", "text": "RT @abjghost: @imoleayomichael was abducted by DSS at 2.30am in his residence and detained for 41days over #EndSARS protest. They still wan\u2026", "public_metrics": {"retweet_count": 65, "reply_count": 0, "like_count": 0, "quote_count": 0}, "source": "Twitter for iPhone", "reply_settings": "everyone", "referenced_tweets": [{"type": "retweeted", "id": "1388174000472432650"}], "author_id": "364212595", "created_at": "2021-04-30T18:47:15.000Z"}], "includes": {"users": [{"id": "364212595", "public_metrics": {"followers_count": 716, "following_count": 637, "tweet_count": 7430, "listed_count": 0}, "profile_image_url": "https://pbs.twimg.com/profile_images/1463823816249905152/P70DYQO9_normal.jpg", "verified": false, "username": "boo_chiiii", "name": "Dr B", "created_at": "2011-08-29T11:52:16.000Z", "description": "", "url": "", "protected": false}, {"id": "707540943134711808", "public_metrics": {"followers_count": 7331, "following_count": 7700, "tweet_count": 23616, "listed_count": 2}, "profile_image_url": "https://pbs.twimg.com/profile_images/1368867227240390657/DIEHzhPo_normal.jpg", "verified": false, "username": "abjghost", "name": "JagunJagun\ud83c\udf11", "pinned_tweet_id": "1342577248029827079", "created_at": "2016-03-09T12:18:01.000Z", "location": "Abuja, Nigeria", "description": "Abuja Connect|Digital Media Plug|Manchester United|Very Social", "url": "", "protected": false}, 
{"id": "927129038933626880", "public_metrics": {"followers_count": 9282, "following_count": 7714, "tweet_count": 27128, "listed_count": 9}, "profile_image_url": "https://pbs.twimg.com/profile_images/1342557547534966785/jp4DiwSW_normal.jpg", "verified": false, "username": "imoleayomichael", "name": "Imoleayo Michael \ud83d\udc51 {iCode}\ud83d\udd4a\ufe0f", "pinned_tweet_id": "1384780512791781376", "created_at": "2017-11-05T11:02:42.000Z", "location": "192.168.1.1", "entities": {"url": {"urls": [{"start": 0, "end": 23, "url": "https://t.co/9xD9Bb1Olp", "expanded_url": "http://icoderesources.com.ng", "display_url": "icoderesources.com.ng"}]}, "description": {"hashtags": [{"start": 52, "end": 63, "tag": "Programmer"}, {"start": 64, "end": 70, "tag": "Virgo"}, {"start": 71, "end": 77, "tag": "Coder"}]}}, "description": "CEO, iCode Resources | Tech Guru | Weirdo | Coder | #Programmer #Virgo #Coder", "url": "https://t.co/9xD9Bb1Olp", "protected": false}], "tweets": [{"id": "1388174000472432650", "entities": {"annotations": [{"start": 33, "end": 35, "probability": 0.4805, "type": "Organization", "normalized_text": "DSS"}, {"start": 81, "end": 86, "probability": 0.3244, "type": "Person", "normalized_text": "41days"}, {"start": 143, "end": 151, "probability": 0.4881, "type": "Person", "normalized_text": "Imoleayo"}], "urls": [{"start": 280, "end": 303, "url": "https://t.co/fDgTVvbQBZ", "expanded_url": "https://twitter.com/abjghost/status/1388174000472432650/photo/1", "display_url": "pic.twitter.com/fDgTVvbQBZ"}], "hashtags": [{"start": 93, "end": 101, "tag": "EndSARS"}, {"start": 224, "end": 237, "tag": "FreeImoleAyo"}], "mentions": [{"start": 0, "end": 16, "username": "imoleayomichael", "id": "927129038933626880"}]}, "possibly_sensitive": true, "attachments": {"media_keys": ["3_1388173984898879492"]}, "conversation_id": "1388174000472432650", "lang": "en", "text": "@imoleayomichael was abducted by DSS at 2.30am in his residence and detained for 41days over #EndSARS protest. 
They still want to convict him.\n\nImoleayo is a Programmer NOT A CRIMINAL!\n\nPls lend your voice in solidarity to \n#FreeImoleAyo\nIt could be you or me.\nPls tweet, RT, Tag https://t.co/fDgTVvbQBZ", "public_metrics": {"retweet_count": 65, "reply_count": 1, "like_count": 49, "quote_count": 0}, "context_annotations": [{"domain": {"id": "65", "name": "Interests and Hobbies Vertical", "description": "Top level interests and hobbies groupings, like Food or Travel"}, "entity": {"id": "848920371311001600", "name": "Technology", "description": "Technology and computing"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "848921413196984320", "name": "Computer programming", "description": "Computer programming"}}], "source": "Twitter for iPhone", "reply_settings": "everyone", "author_id": "707540943134711808", "in_reply_to_user_id": "927129038933626880", "created_at": "2021-04-30T16:50:47.000Z"}]}, "__twarc": {"url": 
"https://api.twitter.com/2/tweets?expansions=author_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id%2Centities.mentions.username%2Cattachments.poll_ids%2Cattachments.media_keys%2Cgeo.place_id&tweet.fields=attachments%2Cauthor_id%2Ccontext_annotations%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Ctext%2Cpossibly_sensitive%2Creferenced_tweets%2Creply_settings%2Csource%2Cwithheld&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cwithheld&media.fields=alt_text%2Cduration_ms%2Cheight%2Cmedia_key%2Cpreview_image_url%2Ctype%2Curl%2Cwidth%2Cpublic_metrics&poll.fields=duration_minutes%2Cend_datetime%2Cid%2Coptions%2Cvoting_status&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&ids=1388203310327508995", "version": "2.9.1", "retrieved_at": "2022-02-05T15:50:57+00:00"}}
{"data": [{"entities": {"annotations": [{"start": 33, "end": 35, "probability": 0.4805, "type": "Organization", "normalized_text": "DSS"}, {"start": 81, "end": 86, "probability": 0.3244, "type": "Person", "normalized_text": "41days"}, {"start": 143, "end": 151, "probability": 0.4881, "type": "Person", "normalized_text": "Imoleayo"}], "urls": [{"start": 280, "end": 303, "url": "https://t.co/fDgTVvbQBZ", "expanded_url": "https://twitter.com/abjghost/status/1388174000472432650/photo/1", "display_url": "pic.twitter.com/fDgTVvbQBZ"}], "hashtags": [{"start": 93, "end": 101, "tag": "EndSARS"}, {"start": 224, "end": 237, "tag": "FreeImoleAyo"}], "mentions": [{"start": 0, "end": 16, "username": "imoleayomichael", "id": "927129038933626880"}]}, "author_id": "707540943134711808", "conversation_id": "1388174000472432650", "source": "Twitter for iPhone", "in_reply_to_user_id": "927129038933626880", "possibly_sensitive": true, "id": "1388174000472432650", "attachments": {"media_keys": ["3_1388173984898879492"]}, "text": "@imoleayomichael was abducted by DSS at 2.30am in his residence and detained for 41days over #EndSARS protest. 
They still want to convict him.\n\nImoleayo is a Programmer NOT A CRIMINAL!\n\nPls lend your voice in solidarity to \n#FreeImoleAyo\nIt could be you or me.\nPls tweet, RT, Tag https://t.co/fDgTVvbQBZ", "reply_settings": "everyone", "public_metrics": {"retweet_count": 65, "reply_count": 1, "like_count": 49, "quote_count": 0}, "lang": "en", "context_annotations": [{"domain": {"id": "65", "name": "Interests and Hobbies Vertical", "description": "Top level interests and hobbies groupings, like Food or Travel"}, "entity": {"id": "848920371311001600", "name": "Technology", "description": "Technology and computing"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "848921413196984320", "name": "Computer programming", "description": "Computer programming"}}], "created_at": "2021-04-30T16:50:47.000Z"}], "includes": {"media": [{"media_key": "3_1388173984898879492", "height": 810, "url": "https://pbs.twimg.com/media/E0PI4mhWEAQPAzY.jpg", "type": "photo", "width": 1440}], "users": [{"id": "707540943134711808", "url": "", "name": "JagunJagun\ud83c\udf11", "protected": false, "public_metrics": {"followers_count": 7331, "following_count": 7700, "tweet_count": 23616, "listed_count": 2}, "pinned_tweet_id": "1342577248029827079", "description": "Abuja Connect|Digital Media Plug|Manchester United|Very Social", "created_at": "2016-03-09T12:18:01.000Z", "verified": false, "username": "abjghost", "location": "Abuja, Nigeria", "profile_image_url": "https://pbs.twimg.com/profile_images/1368867227240390657/DIEHzhPo_normal.jpg"}, {"id": "927129038933626880", "url": "https://t.co/9xD9Bb1Olp", "name": "Imoleayo Michael \ud83d\udc51 {iCode}\ud83d\udd4a\ufe0f", "protected": false, "public_metrics": {"followers_count": 9282, "following_count": 7714, "tweet_count": 27128, "listed_count": 9}, "pinned_tweet_id": "1384780512791781376", "entities": {"url": {"urls": 
[{"start": 0, "end": 23, "url": "https://t.co/9xD9Bb1Olp", "expanded_url": "http://icoderesources.com.ng", "display_url": "icoderesources.com.ng"}]}, "description": {"hashtags": [{"start": 52, "end": 63, "tag": "Programmer"}, {"start": 64, "end": 70, "tag": "Virgo"}, {"start": 71, "end": 77, "tag": "Coder"}]}}, "description": "CEO, iCode Resources | Tech Guru | Weirdo | Coder | #Programmer #Virgo #Coder", "created_at": "2017-11-05T11:02:42.000Z", "verified": false, "username": "imoleayomichael", "location": "192.168.1.1", "profile_image_url": "https://pbs.twimg.com/profile_images/1342557547534966785/jp4DiwSW_normal.jpg"}]}, "__twarc": {"url": "https://api.twitter.com/2/tweets?expansions=author_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id%2Centities.mentions.username%2Cattachments.poll_ids%2Cattachments.media_keys%2Cgeo.place_id&tweet.fields=attachments%2Cauthor_id%2Ccontext_annotations%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Ctext%2Cpossibly_sensitive%2Creferenced_tweets%2Creply_settings%2Csource%2Cwithheld&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cwithheld&media.fields=alt_text%2Cduration_ms%2Cheight%2Cmedia_key%2Cpreview_image_url%2Ctype%2Curl%2Cwidth%2Cpublic_metrics&poll.fields=duration_minutes%2Cend_datetime%2Cid%2Coptions%2Cvoting_status&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&ids=1388174000472432650", "version": "2.9.1", "retrieved_at": "2022-02-05T15:51:22+00:00"}}
80 changes: 55 additions & 25 deletions test_twarc_hashtags.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,41 +3,71 @@

runner = CliRunner()


def test_basic():
result = runner.invoke(hashtags, ['test-data/tweets1.jsonl'])
result = runner.invoke(hashtags, ["test-data/tweets1.jsonl"])
assert result.exit_code == 0
assert result.output == \
'''hashtag,tweets
assert (
result.output
== """hashtag,tweets
gettyimagesnews,2
naacpimageawards,2
usa,1
trafficking,1
refugees,1
obama,1
isis,1
illegals,1
cartel,1
savetheseabirds,2
biden,1
'''
boulder,1
cartel,1
forthepeopleact,1
guncontrolnow,1
illegals,1
isis,1
islarebelde,1
nft,1
niunamenos,1
nra,1
obama,1
patriaomuerte,1
refugees,1
sethrich,1
trafficking,1
usa,1
"""
)


def test_group():
result = runner.invoke(hashtags, ['--group', 'day', 'test-data/tweets2.jsonl'])
result = runner.invoke(hashtags, ["--group", "day", "test-data/tweets2.jsonl"])
assert result.exit_code == 0
assert result.output.startswith(
'''hashtag,time,tweets
"""hashtag,time,tweets
ethereum,2021-08-20,13
bitcoin,2021-08-20,10
reinstate45,2021-08-20,5
hypocrisy,2021-08-20,5
fbi,2021-08-20,5
''')
hypocrisy,2021-08-20,5
"""
)


def test_limit():
result = runner.invoke(hashtags, ['--limit', '5', 'test-data/tweets2.jsonl'])
result = runner.invoke(hashtags, ["--limit", "5", "test-data/tweets2.jsonl"])
assert result.exit_code == 0
assert result.output == \
'''hashtag,tweets
wtfhappenedin1971,389
bitcoin,372
banliznowjack,230
btc,113
farmersprotest,111
'''
assert (
result.output
== """hashtag,tweets
banliznowjack,650
wtfhappenedin1971,599
bitcoin,469
farmersprotest,128
btc,121
"""
)


def test_retweets():
    """Check hashtag counts for the retweet fixture.

    tweets3.jsonl holds a retweet whose truncated text only carries
    #EndSARS, plus the original tweet carrying both #EndSARS and
    #FreeImoleAyo; both tags are expected to be reported twice.
    """
    expected_prefix = "hashtag,tweets\nendsars,2\nfreeimoleayo,2\n"

    result = runner.invoke(hashtags, ["test-data/tweets3.jsonl"])

    assert result.exit_code == 0
    assert result.output.startswith(expected_prefix)
58 changes: 38 additions & 20 deletions twarc_hashtags.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,29 @@
from twarc.expansions import ensure_flattened
from twarc.decorators2 import FileSizeProgressBar


@click.command()
@click.option(
"--group",
"--group",
"-g",
type=click.Choice(["day", "week", "month", "year"]),
help="Group hashtag results by time"
help="Group hashtag results by time",
)
@click.option(
"--db",
"-d",
"db_path",
default="hashtags.db",
help="Path to use for the SQLite database"
help="Path to use for the SQLite database",
)
@click.option(
"--skip-import",
"-s",
is_flag=True,
help="Skip loading the tweets and use existing SQLite database"
help="Skip loading the tweets and use existing SQLite database",
)
@click.option(
"--limit",
"-l",
type=int,
default=0,
help="Limit output to this many hashtags"
"--limit", "-l", type=int, default=0, help="Limit output to this many hashtags"
)
@click.argument("infile", type=click.File("r"), default="-")
@click.argument("outfile", type=click.File("w"), default="-")
Expand All @@ -51,7 +48,7 @@ def hashtags(group, db_path, limit, skip_import, infile, outfile):

def load(infile, outfile, db):

with FileSizeProgressBar(infile, outfile) as progress:
with FileSizeProgressBar(infile, outfile) as progress:
for line in infile:
progress.update(len(line))

Expand All @@ -62,20 +59,43 @@ def load(infile, outfile, db):

data = json.loads(line)
for tweet in ensure_flattened(data):
# Process Retweets:
if "referenced_tweets" in tweet:
rts = [
Copy link
Member

@edsu edsu Feb 8, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should consider hashtags in any referenced tweet?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about that, but I was unsure how to handle it. Maybe it should have one switch for all (`--include-referenced-tweets` or `--count-referenced-tweets`), or add one for each type: `--count-replies`, `--count-quotes`, etc.?

t
for t in tweet["referenced_tweets"]
if t["type"] == "retweeted"
]
retweeted_tweet = rts[-1] if rts else None
# If it's a native retweet, replace the "RT @user Text" with the original text, metrics, and entities, but keep the Author.
if retweeted_tweet:
# A retweet inherits everything from retweeted tweet.
tweet["text"] = retweeted_tweet.pop(
"text", tweet.pop("text", None)
)
tweet["entities"] = retweeted_tweet.pop(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we are only interested in hashtags, isn't `entities` all that is needed?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but I'm inclined to leave them in because it's more explicit about what's happening, and so that any future additions or modifications won't throw up any surprises. I don't think it hurts to have the extra bits there, but I'm also OK with just commenting them out.

"entities", tweet.pop("entities", None)
)
tweet["attachments"] = retweeted_tweet.pop(
"attachments", tweet.pop("attachments", None)
)
tweet["context_annotations"] = retweeted_tweet.pop(
"context_annotations",
tweet.pop("context_annotations", None),
)
tweet["public_metrics"] = retweeted_tweet.pop(
"public_metrics", tweet.pop("public_metrics", None)
)
if "entities" in tweet and "hashtags" in tweet["entities"]:
for hashtag in tweet["entities"]["hashtags"]:
db.execute(
"""
INSERT INTO hashtags (id, created, hashtag)
VALUES (?, ?, ?)
""",
(
tweet["id"],
tweet["created_at"],
hashtag["tag"].lower()
)
(tweet["id"], tweet["created_at"], hashtag["tag"].lower()),
)

db.commit()


Expand All @@ -97,8 +117,7 @@ def export(outfile, db, group, limit):
elif group == "year":
fmt = "%Y"

sql = \
"""
sql = """
SELECT
hashtag,
STRFTIME(?, created) AS time,
Expand All @@ -112,8 +131,7 @@ def export(outfile, db, group, limit):

# otherwise we're doing a global count
else:
sql = \
"""
sql = """
SELECT hashtag, COUNT(*) AS tweets
FROM HASHTAGS
GROUP BY hashtag
Expand Down