Skip to content

Commit

Permalink
tmp
Browse files Browse the repository at this point in the history
  • Loading branch information
yu23ki14 committed May 15, 2024
1 parent a82d7dc commit 0a27638
Show file tree
Hide file tree
Showing 12 changed files with 89 additions and 95 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ downloads/
eggs/
.eggs/
lib/
!etl/src/lib/
lib64/
parts/
sdist/
Expand Down
30 changes: 30 additions & 0 deletions common/birdxplorer_common/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@
UserEnrollment,
UserId,
UserName,
BinaryBool,
NotesClassification,
NotesHarmful,
NotesValidationDifficulty
)
from .models import XUser as XUserModel
from .settings import GlobalSettings
Expand Down Expand Up @@ -100,6 +104,32 @@ class PostRecord(Base):
repost_count: Mapped[NonNegativeInt] = mapped_column(nullable=False)
impression_count: Mapped[NonNegativeInt] = mapped_column(nullable=False)

class RowNoteRecord(Base):
    """ORM record for one row of the ``row_notes`` table.

    The columns mirror the snake_cased field names declared below: the note
    identity and author, its creation timestamp, the tweet it refers to, a
    set of binary flags, and the classification / harmful /
    validation-difficulty / summary fields.
    """

    # NOTE(review): "row" in the class and table name may be intended as
    # "raw" — confirm before anything depends on the table name.
    __tablename__ = "row_notes"

    # Identity and provenance of the note.
    note_id: Mapped[NoteId] = mapped_column(primary_key=True)
    note_author_participant_id: Mapped[ParticipantId] = mapped_column(nullable=False)
    created_at_millis: Mapped[TwitterTimestamp] = mapped_column(nullable=False)
    tweet_id: Mapped[TweetId] = mapped_column(nullable=False)

    # Binary flags; every one of them is required (non-nullable).
    believable: Mapped[BinaryBool] = mapped_column(nullable=False)
    misleading_other: Mapped[BinaryBool] = mapped_column(nullable=False)
    misleading_factual_error: Mapped[BinaryBool] = mapped_column(nullable=False)
    misleading_manipulated_media: Mapped[BinaryBool] = mapped_column(nullable=False)
    misleading_outdated_information: Mapped[BinaryBool] = mapped_column(nullable=False)
    misleading_missing_important_context: Mapped[BinaryBool] = mapped_column(nullable=False)
    misleading_unverified_claim_as_fact: Mapped[BinaryBool] = mapped_column(nullable=False)
    misleading_satire: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_misleading_other: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_misleading_factually_correct: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_misleading_outdated_but_not_when_written: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_misleading_clearly_satire: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_misleading_personal_opinion: Mapped[BinaryBool] = mapped_column(nullable=False)
    trustworthy_sources: Mapped[BinaryBool] = mapped_column(nullable=False)
    is_media_note: Mapped[BinaryBool] = mapped_column(nullable=False)

    # Categorical fields, each mapped with its dedicated model type.
    classification: Mapped[NotesClassification] = mapped_column(nullable=False)
    harmful: Mapped[NotesHarmful] = mapped_column(nullable=False)
    # Uses NotesValidationDifficulty (imported for this purpose) rather than
    # the free-text SummaryString type, consistent with the two columns above.
    validation_difficulty: Mapped[NotesValidationDifficulty] = mapped_column(nullable=False)

    # Free-text body of the note.
    summary: Mapped[SummaryString] = mapped_column(nullable=False)

class Storage:
def __init__(self, engine: Engine) -> None:
Expand Down
1 change: 0 additions & 1 deletion etl/birdxplorer_etl/__init__.py

This file was deleted.

40 changes: 0 additions & 40 deletions etl/birdxplorer_etl/extract.py

This file was deleted.

5 changes: 0 additions & 5 deletions etl/birdxplorer_etl/load.py

This file was deleted.

24 changes: 0 additions & 24 deletions etl/birdxplorer_etl/main.py

This file was deleted.

5 changes: 0 additions & 5 deletions etl/birdxplorer_etl/transform.py

This file was deleted.

18 changes: 3 additions & 15 deletions etl/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,36 +5,24 @@ build-backend = "setuptools.build_meta"
[project]
name = "birdxplorer_etl"
version = "0.1.0"
description = "BirdXplorer ETL is data extraction, t l "
description = "BirdXplorer ETL is data extraction"
authors = [
{name = "osoken"},
{name = "yu23ki14"}
]
dynamic = [
"version",
]
dependencies = [
"birdxplorer_common @ git+https://github.com/codeforjapan/BirdXplorer.git@feature/issue-53-divide-python-packages#subdirectory=common",
"pandas",
"sqlalchemy",
"requests",
"pytest",
"prefect"
"prefect",
"stringcase"
]

[project.urls]
Source = "https://github.com/codeforjapan/BirdXplorer"

[tool.setuptools]
packages=[
"birdxplorer_etl",
"pandas",
"sqlalchemy",
"requests",
"pytest",
"stringcase"
]

[tool.setuptools.package-data]
birdxplorer = ["py.typed"]

Expand Down
43 changes: 38 additions & 5 deletions etl/src/extract.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,45 @@
import birdxplorer_common.models
from prefect import get_run_logger
import requests
from datetime import datetime, timedelta
import csv
import birdxplorer_common
from typing import List
import stringcase
import settings
from lib.sqlite.init import init_db

def extract_data(max_lookback_days: int = 30):
    """Download the newest available Community Notes dump into the local DB.

    Starting from today's date, the public ``notes-00000.tsv`` URL is probed
    one day at a time, going backwards, until a request answers with HTTP
    200. The TSV header names are converted to snake_case and every data row
    is stored through the session returned by ``init_db()``.

    Args:
        max_lookback_days: How many days before today may be probed before
            giving up. Bounds the search so an unreachable or relocated data
            source cannot make the task loop forever.

    Raises:
        RuntimeError: If no dump is found within ``max_lookback_days``.
    """
    # Imported locally so this function does not rely on a file-level import.
    from birdxplorer_common.storage import RowNoteRecord

    logger = get_run_logger()
    logger.info("Downloading community notes data")

    db = init_db()
    try:
        # Fetch the most recent note data.
        date = datetime.now()
        for _ in range(max_lookback_days + 1):
            url = f'https://ton.twimg.com/birdwatch-public-data/{date.strftime("%Y/%m/%d")}/notes/notes-00000.tsv'
            logger.info(url)
            # A timeout keeps a stalled connection from hanging the whole flow.
            res = requests.get(url, timeout=60)
            if res.status_code == 200:
                tsv_data = res.content.decode('utf-8').splitlines()
                reader = csv.DictReader(tsv_data, delimiter='\t')
                # An empty body yields no header; fall back to an empty list.
                reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames or []]

                for row in reader:
                    # A session only accepts mapped instances, not dicts.
                    # NOTE(review): assumes the snake_cased TSV columns match
                    # the RowNoteRecord attributes one-to-one — confirm.
                    db.add(RowNoteRecord(**row))
                break
            date = date - timedelta(days=1)
        else:
            raise RuntimeError(
                f"No community notes data found within the last {max_lookback_days} days"
            )

        db.commit()
    finally:
        # Release the connection whether or not the load succeeded.
        db.close()

    # TODO: fetch the tweets referenced by notes whose created_at_millis lies
    # between settings.TARGET_TWITTER_POST_START_UNIX_MILLISECOND and
    # settings.TARGET_TWITTER_POST_END_UNIX_MILLISECOND.
    return
17 changes: 17 additions & 0 deletions etl/src/lib/sqlite/init.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Initialize the SQLite database used by the ETL: create the tables registered on the SQLAlchemy Base metadata and return a session.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import os
from prefect import get_run_logger
from birdxplorer_common.storage import Row

def init_db():
    """Create the local SQLite database if needed and return a new session.

    The database file is ``data/note.db``, resolved three directories above
    the directory containing this module. All tables registered on
    ``Base.metadata`` are created on the engine before the session is
    returned.

    Returns:
        A new SQLAlchemy session bound to the SQLite engine.
    """
    # Imported here because ``Base`` is not brought into scope at file level.
    from birdxplorer_common.storage import Base

    logger = get_run_logger()

    db_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..', 'data', 'note.db'))
    # SQLite creates the database file itself, but not a missing parent directory.
    os.makedirs(os.path.dirname(db_path), exist_ok=True)
    logger.info(f'Initializing database at {db_path}')
    engine = create_engine('sqlite:///' + db_path)
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)

    return Session()
File renamed without changes.
File renamed without changes.

0 comments on commit 0a27638

Please sign in to comment.