From fc702a315266007a6cf7d6fb342a219952fb9319 Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Tue, 6 Aug 2024 09:40:44 +0900 Subject: [PATCH] rate limit --- etl/src/birdxplorer_etl/extract.py | 28 +++++++++++++-------- etl/src/birdxplorer_etl/lib/x/postlookup.py | 27 ++++++++++---------- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py index 878d4f9..0e9b6fb 100644 --- a/etl/src/birdxplorer_etl/extract.py +++ b/etl/src/birdxplorer_etl/extract.py @@ -70,13 +70,18 @@ def extract_data(db: Session): for note in postExtract_targetNotes: tweet_id = note.tweet_id - is_tweetExist = db.query(RowPostRecord).filter(RowPostRecord.post_id == tweet_id).first() + is_tweetExist = db.query(RowPostRecord).filter(RowPostRecord.post_id == str(tweet_id)).first() if is_tweetExist is not None: + logger.info(f"tweet_id {tweet_id} is already exist") note.row_post_id = tweet_id continue logger.info(tweet_id) post = lookup(tweet_id) + + if "data" not in post: + continue + created_at = datetime.strptime(post["data"]["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") created_at_millis = int(created_at.timestamp() * 1000) @@ -97,21 +102,24 @@ def extract_data(db: Session): followers_count=user_data.get("public_metrics", {}).get("followers_count"), following_count=user_data.get("public_metrics", {}).get("following_count"), tweet_count=user_data.get("public_metrics", {}).get("tweet_count"), - verified=user_data.get("verified"), - verified_type=user_data.get("verified_type"), - location=user_data.get("location"), + verified=user_data.get("verified", False), + verified_type=user_data.get("verified_type", ""), + location=user_data.get("location", ""), url=user_data.get("url", ""), ) db.add(db_user) - media_url = post["includes"]["media"][0]["url"] if "media" in post["includes"] else "" - media_type = post["includes"]["media"][0]["type"] if "media" in post["includes"] else "" + media_data = ( + post["includes"]["media"][0] + if "includes" in post and "media" in post["includes"] and len(post["includes"]["media"]) > 0 + else {} + ) db_post = RowPostRecord( post_id=post["data"]["id"], author_id=post["data"]["author_id"], text=post["data"]["text"], - media_type=media_type, - media_url=media_url, + media_type=media_data.get("type", ""), + media_url=media_data.get("url", ""), created_at=created_at_millis, like_count=post["data"]["public_metrics"]["like_count"], repost_count=post["data"]["public_metrics"]["retweet_count"], @@ -123,9 +131,9 @@ def extract_data(db: Session): ) db.add(db_post) note.row_post_id = tweet_id - time.sleep(60) + time.sleep(90) + db.commit() continue - db.commit() # select note from db, get relation tweet and user data note = db.query(RowNoteRecord).filter(RowNoteRecord.tweet_id == "1797617478950170784").first() diff --git a/etl/src/birdxplorer_etl/lib/x/postlookup.py b/etl/src/birdxplorer_etl/lib/x/postlookup.py index f949492..557cb9e 100644 --- a/etl/src/birdxplorer_etl/lib/x/postlookup.py +++ b/etl/src/birdxplorer_etl/lib/x/postlookup.py @@ -3,15 +3,20 @@ import settings from prefect import get_run_logger + def create_url(id): logger = get_run_logger() - expansions = 'expansions=attachments.poll_ids,attachments.media_keys,author_id,edit_history_tweet_ids,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id' + expansions = "expansions=attachments.poll_ids,attachments.media_keys,author_id,edit_history_tweet_ids,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id" tweet_fields = "tweet.fields=attachments,author_id,context_annotations,conversation_id,created_at,edit_controls,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld" - media_fields = 'media.fields=duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics,alt_text,variants' - place_fields = 'place.fields=contained_within,country,country_code,full_name,geo,id,name,place_type' - user_fields = 'user.fields=created_at,description,entities,id,location,most_recent_tweet_id,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,verified_type,withheld' - - url = "https://api.twitter.com/2/tweets/{}?{}&{}&{}&{}&{}".format(id, tweet_fields, expansions,media_fields,place_fields,user_fields) + media_fields = ( + "media.fields=duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics,alt_text,variants" + ) + place_fields = "place.fields=contained_within,country,country_code,full_name,geo,id,name,place_type" + user_fields = "user.fields=created_at,description,entities,id,location,most_recent_tweet_id,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,verified_type,withheld" + + url = "https://api.twitter.com/2/tweets/{}?{}&{}&{}&{}&{}".format( + id, tweet_fields, expansions, media_fields, place_fields, user_fields + ) logger.info(url) return url @@ -30,18 +35,14 @@ def connect_to_endpoint(url): response = requests.request("GET", url, auth=bearer_oauth) print(response.status_code) if response.status_code != 200: - raise Exception( - "Request returned an error: {} {}".format( - response.status_code, response.text - ) - ) + raise Exception("Request returned an error: {} {}".format(response.status_code, response.text)) return response.json() def lookup(id): url = create_url(id) json_response = connect_to_endpoint(url) - print(json.dumps(json_response, indent=4, sort_keys=True)) return json_response -# https://oauth-playground.glitch.me/?id=findTweetsById¶ms=%28%27query%21%28%27C%27*B%29%7Ebody%21%28%29%7Epath%21%28%29*B%7EFAG%27%7EuserADfile_image_url%2CiNcreated_at%2CconnectK_statuHurlMublic_JtricHuserDtecteNentitieHdescriptK%27%7ECG%2Creferenced_Fs.id-keys-source_F%27%7EOAtype%2Curl%27%29*%7EidL146E37035677698IE43339741184I-%2CattachJnts.O_A.fieldLBE46120540165%27CexpansKLDnaJMroE03237FtweetGauthor_idHs%2CI%2C146JmeKionLs%21%27M%2CpNd%2COJdia%01ONMLKJIHGFEDCBA-*_ \ No newline at end of file + +# https://oauth-playground.glitch.me/?id=findTweetsById¶ms=%28%27query%21%28%27C%27*B%29%7Ebody%21%28%29%7Epath%21%28%29*B%7EFAG%27%7EuserADfile_image_url%2CiNcreated_at%2CconnectK_statuHurlMublic_JtricHuserDtecteNentitieHdescriptK%27%7ECG%2Creferenced_Fs.id-keys-source_F%27%7EOAtype%2Curl%27%29*%7EidL146E37035677698IE43339741184I-%2CattachJnts.O_A.fieldLBE46120540165%27CexpansKLDnaJMroE03237FtweetGauthor_idHs%2CI%2C146JmeKionLs%21%27M%2CpNd%2COJdia%01ONMLKJIHGFEDCBA-*_