From 736c2723b0825eeac1d8223817d56b6bc26d1b27 Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Wed, 7 Aug 2024 22:38:34 +0900 Subject: [PATCH] update for rate limit --- etl/src/birdxplorer_etl/extract.py | 11 +------- etl/src/birdxplorer_etl/lib/x/postlookup.py | 28 +++++++++++++++------ 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py index 0e9b6fb..81abf65 100644 --- a/etl/src/birdxplorer_etl/extract.py +++ b/etl/src/birdxplorer_etl/extract.py @@ -7,7 +7,6 @@ from lib.x.postlookup import lookup from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord import settings -import time def extract_data(db: Session): @@ -51,13 +50,6 @@ def extract_data(db: Session): db.commit() - # post = lookup() - # created_at = datetime.strptime(post["data"]["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") - # created_at_millis = int(created_at.timestamp() * 1000) - # db_post = RowPostRecord(post_id=post["data"]["id"], author_id=post["data"]["author_id"], text=post["data"]["text"], created_at=created_at_millis,like_count=post["data"]["public_metrics"]["like_count"],repost_count=post["data"]["public_metrics"]["retweet_count"],bookmark_count=post["data"]["public_metrics"]["bookmark_count"],impression_count=post["data"]["public_metrics"]["impression_count"],quote_count=post["data"]["public_metrics"]["quote_count"],reply_count=post["data"]["public_metrics"]["reply_count"],lang=post["data"]["lang"]) - # db.add(db_post) - # db.commit() - # Noteに紐づくtweetデータを取得 postExtract_targetNotes = ( db.query(RowNoteRecord) @@ -79,7 +71,7 @@ def extract_data(db: Session): logger.info(tweet_id) post = lookup(tweet_id) - if "data" not in post: + if post == None or "data" not in post: continue created_at = datetime.strptime(post["data"]["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") @@ -131,7 +123,6 @@ def extract_data(db: Session): ) db.add(db_post) note.row_post_id = tweet_id - time.sleep(90) db.commit() continue diff --git a/etl/src/birdxplorer_etl/lib/x/postlookup.py b/etl/src/birdxplorer_etl/lib/x/postlookup.py index 557cb9e..1410ceb 100644 --- a/etl/src/birdxplorer_etl/lib/x/postlookup.py +++ b/etl/src/birdxplorer_etl/lib/x/postlookup.py @@ -1,11 +1,10 @@ import requests -import json import settings from prefect import get_run_logger +import time def create_url(id): - logger = get_run_logger() expansions = "expansions=attachments.poll_ids,attachments.media_keys,author_id,edit_history_tweet_ids,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id" tweet_fields = "tweet.fields=attachments,author_id,context_annotations,conversation_id,created_at,edit_controls,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld" media_fields = ( @@ -17,7 +16,6 @@ def create_url(id): url = "https://api.twitter.com/2/tweets/{}?{}&{}&{}&{}&{}".format( id, tweet_fields, expansions, media_fields, place_fields, user_fields ) - logger.info(url) return url @@ -32,17 +30,31 @@ def bearer_oauth(r): def connect_to_endpoint(url): + logger = get_run_logger() response = requests.request("GET", url, auth=bearer_oauth) - print(response.status_code) - if response.status_code != 200: + if response.status_code == 429: + limit = response.headers["x-rate-limit-reset"] + logger.info("Waiting for rate limit reset...") + time.sleep(int(limit) - int(time.time()) + 1) + data = connect_to_endpoint(url) + return data + elif response.status_code != 200: raise Exception("Request returned an error: {} {}".format(response.status_code, response.text)) return response.json() +def check_existence(id): + url = "https://publish.twitter.com/oembed?url=https://x.com/CommunityNotes/status/{}&partner=&hide_thread=false".format( + id + ) + status = requests.get(url).status_code + return status == 200 + + def lookup(id): + isExist = check_existence(id) + if not isExist: + return None url = create_url(id) json_response = connect_to_endpoint(url) return json_response - - -# https://oauth-playground.glitch.me/?id=findTweetsById¶ms=%28%27query%21%28%27C%27*B%29%7Ebody%21%28%29%7Epath%21%28%29*B%7EFAG%27%7EuserADfile_image_url%2CiNcreated_at%2CconnectK_statuHurlMublic_JtricHuserDtecteNentitieHdescriptK%27%7ECG%2Creferenced_Fs.id-keys-source_F%27%7EOAtype%2Curl%27%29*%7EidL146E37035677698IE43339741184I-%2CattachJnts.O_A.fieldLBE46120540165%27CexpansKLDnaJMroE03237FtweetGauthor_idHs%2CI%2C146JmeKionLs%21%27M%2CpNd%2COJdia%01ONMLKJIHGFEDCBA-*_