Skip to content

Commit

Permalink
updated to use twarc.ensure_flattened
Browse files Browse the repository at this point in the history
  • Loading branch information
edsu committed May 31, 2021
1 parent f2404d2 commit f88c3bb
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 21 deletions.
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="twarc-csv",
version="0.3.2",
version="0.3.3",
url="https://github.com/docnow/twarc-csv",
author="Igor Brigadir",
author_email="[email protected]",
Expand All @@ -15,7 +15,7 @@
long_description_content_type="text/markdown",
python_requires=">=3.3",
install_requires=[
"twarc>=2.0.12",
"twarc>=2.1.1",
"pandas>=1.2.3",
"more-itertools>=8.7.0",
"tqdm>=4.59.0",
Expand Down
25 changes: 6 additions & 19 deletions twarc_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from tqdm import tqdm
from collections import OrderedDict
from more_itertools import ichunked
from twarc.expansions import flatten
from twarc import ensure_flattened

log = logging.getLogger("twarc")

Expand Down Expand Up @@ -206,26 +206,13 @@ def _read_lines(self):
self.progress.update(self.infile.tell() - self.progress.n)
line = self.infile.readline()

def _handle_formats(self, batch):
def _generate_tweets(self, batch):
"""
Handle different types of json formats, generating 1 tweet at a time
a batch is a number of lines from a json,
these can be full pages of requests or individual tweets.
Generate flattened tweets from a batch.
"""
for item in batch:
# if it has a "data" key, ensure the data is flattened
if "data" in item:
# flatten a list of tweets
if isinstance(item["data"], list):
for i in flatten(item)["data"]:
yield i
# flatten a single tweet, eg, from stream
else:
yield flatten(item)["data"]
else:
# this assumes the data is flattened
yield item
for tweet in ensure_flattened(item):
yield tweet

def _inline_referenced_tweets(self, tweet):
"""
Expand Down Expand Up @@ -315,7 +302,7 @@ def _process_batch(self, batch):
# (Optional) append referenced tweets as new rows
tweet_batch = itertools.chain.from_iterable(
self._process_tweets(self._inline_referenced_tweets(tweet))
for tweet in self._handle_formats(batch)
for tweet in self._generate_tweets(batch)
)

_df = pd.json_normalize(list(tweet_batch), errors="ignore")
Expand Down

0 comments on commit f88c3bb

Please sign in to comment.