Skip to content

Commit

Permalink
rate limit
Browse files Browse the repository at this point in the history
  • Loading branch information
yu23ki14 committed Aug 6, 2024
1 parent ba81dba commit fc702a3
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 23 deletions.
28 changes: 18 additions & 10 deletions etl/src/birdxplorer_etl/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,18 @@ def extract_data(db: Session):
for note in postExtract_targetNotes:
tweet_id = note.tweet_id

is_tweetExist = db.query(RowPostRecord).filter(RowPostRecord.post_id == tweet_id).first()
is_tweetExist = db.query(RowPostRecord).filter(RowPostRecord.post_id == str(tweet_id)).first()
if is_tweetExist is not None:
logger.info(f"tweet_id {tweet_id} is already exist")
note.row_post_id = tweet_id
continue

logger.info(tweet_id)
post = lookup(tweet_id)

if "data" not in post:
continue

created_at = datetime.strptime(post["data"]["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
created_at_millis = int(created_at.timestamp() * 1000)

Expand All @@ -97,21 +102,24 @@ def extract_data(db: Session):
followers_count=user_data.get("public_metrics", {}).get("followers_count"),
following_count=user_data.get("public_metrics", {}).get("following_count"),
tweet_count=user_data.get("public_metrics", {}).get("tweet_count"),
verified=user_data.get("verified"),
verified_type=user_data.get("verified_type"),
location=user_data.get("location"),
verified=user_data.get("verified", False),
verified_type=user_data.get("verified_type", ""),
location=user_data.get("location", ""),
url=user_data.get("url", ""),
)
db.add(db_user)

media_url = post["includes"]["media"][0]["url"] if "media" in post["includes"] else ""
media_type = post["includes"]["media"][0]["type"] if "media" in post["includes"] else ""
media_data = (
post["includes"]["media"][0]
if "includes" in post and "media" in post["includes"] and len(post["includes"]["media"]) > 0
else {}
)
db_post = RowPostRecord(
post_id=post["data"]["id"],
author_id=post["data"]["author_id"],
text=post["data"]["text"],
media_type=media_type,
media_url=media_url,
media_type=media_data.get("type", ""),
media_url=media_data.get("url", ""),
created_at=created_at_millis,
like_count=post["data"]["public_metrics"]["like_count"],
repost_count=post["data"]["public_metrics"]["retweet_count"],
Expand All @@ -123,9 +131,9 @@ def extract_data(db: Session):
)
db.add(db_post)
note.row_post_id = tweet_id
time.sleep(60)
time.sleep(90)
db.commit()
continue
db.commit()

# select note from db, get relation tweet and user data
note = db.query(RowNoteRecord).filter(RowNoteRecord.tweet_id == "1797617478950170784").first()
Expand Down
27 changes: 14 additions & 13 deletions etl/src/birdxplorer_etl/lib/x/postlookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,20 @@
import settings
from prefect import get_run_logger


def create_url(id):
    """Build the X (Twitter) API v2 single-post lookup URL for ``id``.

    The URL targets ``https://api.twitter.com/2/tweets/<id>`` and carries five
    query parameters (``tweet.fields``, ``expansions``, ``media.fields``,
    ``place.fields``, ``user.fields``), in that order, each listing the fields
    requested from the API. The finished URL is logged at INFO level through
    the Prefect run logger before being returned.

    Args:
        id: Identifier of the post to look up. It is interpolated into the
            URL path as-is via ``str.format``; no validation or escaping is
            applied here.

    Returns:
        The full request URL as a string.
    """
    logger = get_run_logger()

    # Each value below is a complete "name=comma,separated,fields" query
    # parameter; they are joined with "&" when the URL is assembled.
    expansions = "expansions=attachments.poll_ids,attachments.media_keys,author_id,edit_history_tweet_ids,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id"
    tweet_fields = "tweet.fields=attachments,author_id,context_annotations,conversation_id,created_at,edit_controls,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld"
    media_fields = (
        "media.fields=duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics,alt_text,variants"
    )
    place_fields = "place.fields=contained_within,country,country_code,full_name,geo,id,name,place_type"
    user_fields = "user.fields=created_at,description,entities,id,location,most_recent_tweet_id,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,verified_type,withheld"

    # Parameter order (tweet.fields first, then expansions, ...) is kept
    # exactly as before so the logged/requested URL is unchanged.
    url = "https://api.twitter.com/2/tweets/{}?{}&{}&{}&{}&{}".format(
        id, tweet_fields, expansions, media_fields, place_fields, user_fields
    )
    logger.info(url)
    return url

Expand All @@ -30,18 +35,14 @@ def connect_to_endpoint(url):
response = requests.request("GET", url, auth=bearer_oauth)
print(response.status_code)
if response.status_code != 200:
raise Exception(
"Request returned an error: {} {}".format(
response.status_code, response.text
)
)
raise Exception("Request returned an error: {} {}".format(response.status_code, response.text))
return response.json()


def lookup(id):
    """Look up a single post by ``id`` and return the decoded JSON response.

    The request URL comes from ``create_url`` and the request itself is made
    by ``connect_to_endpoint``; any exception raised by either propagates to
    the caller unchanged.

    Args:
        id: Identifier of the post to fetch.

    Returns:
        Whatever ``connect_to_endpoint`` returns for the built URL (the parsed
        JSON body of the response).
    """
    payload = connect_to_endpoint(create_url(id))
    # Echo the full response to stdout, pretty-printed with sorted keys.
    rendered = json.dumps(payload, indent=4, sort_keys=True)
    print(rendered)
    return payload

# https://oauth-playground.glitch.me/?id=findTweetsById&params=%28%27query%21%28%27C%27*B%29%7Ebody%21%28%29%7Epath%21%28%29*B%7EFAG%27%7EuserADfile_image_url%2CiNcreated_at%2CconnectK_statuHurlMublic_JtricHuserDtecteNentitieHdescriptK%27%7ECG%2Creferenced_Fs.id-keys-source_F%27%7EOAtype%2Curl%27%29*%7EidL146E37035677698IE43339741184I-%2CattachJnts.O_A.fieldLBE46120540165%27CexpansKLDnaJMroE03237FtweetGauthor_idHs%2CI%2C146JmeKionLs%21%27M%2CpNd%2COJdia%01ONMLKJIHGFEDCBA-*_

# https://oauth-playground.glitch.me/?id=findTweetsById&params=%28%27query%21%28%27C%27*B%29%7Ebody%21%28%29%7Epath%21%28%29*B%7EFAG%27%7EuserADfile_image_url%2CiNcreated_at%2CconnectK_statuHurlMublic_JtricHuserDtecteNentitieHdescriptK%27%7ECG%2Creferenced_Fs.id-keys-source_F%27%7EOAtype%2Curl%27%29*%7EidL146E37035677698IE43339741184I-%2CattachJnts.O_A.fieldLBE46120540165%27CexpansKLDnaJMroE03237FtweetGauthor_idHs%2CI%2C146JmeKionLs%21%27M%2CpNd%2COJdia%01ONMLKJIHGFEDCBA-*_

0 comments on commit fc702a3

Please sign in to comment.