Import and link campaigns #180

Merged · 5 commits · Jan 19, 2024
3 changes: 2 additions & 1 deletion .gitignore
@@ -3,4 +3,5 @@ __pycache__
*.pyc
/static
*.csv
-.env
+.env
+!_data/raw/offices.csv
12 changes: 9 additions & 3 deletions Makefile
@@ -5,9 +5,15 @@ import/% : s3/$(AWS_STORAGE_BUCKET_NAME)/%.gz
--year $(word 2, $(subst _, , $*))

import/local/% : _data/raw/%.csv
-    python manage.py import_api_data --transaction-type $(word 1, $(subst _, , $*)) \
-        --year $(word 2, $(subst _, , $*)) \
-        --file $<
+    python manage.py import_api_data --transaction-type $(word 1, $(subst _, , $*)) \
+        --year $(word 2, $(subst _, , $*)) \
+        --file $<
+
+import/offices :
+    python manage.py import_office_api_data
+
+import/local/offices : _data/raw/offices.csv
+    python manage.py import_office_api_data --file $<

s3/$(AWS_STORAGE_BUCKET_NAME)/%.gz : %.gz
aws s3 cp $< s3://$$AWS_STORAGE_BUCKET_NAME
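
The two new targets mirror the existing transaction imports: import/offices downloads the scraped office data from S3, while import/local/offices reads the checked-in _data/raw/offices.csv. As a minimal sketch of the equivalent call from Python, assuming Django settings for this project are already configured (the Makefile itself simply shells out to manage.py):

    # Illustrative sketch, not part of the diff: mirrors `make import/local/offices`.
    from django.core.management import call_command

    # Same as `python manage.py import_office_api_data --file _data/raw/offices.csv`
    call_command("import_office_api_data", file="_data/raw/offices.csv")

    # Omitting --file makes the command download offices.gz from S3 instead,
    # as its handle() method below shows.
    call_command("import_office_api_data")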
1,029 changes: 1,029 additions & 0 deletions _data/raw/offices.csv

Large diffs are not rendered by default.

80 changes: 39 additions & 41 deletions camp_fin/management/commands/import_api_data.py
@@ -356,6 +356,14 @@ def make_pac(self, record, entity_type=None):
)

def make_filing(self, record):
+        if not any(
+            (
+                self.parse_date(record["Filed Date"]),
+                self.parse_date(record["End of Period"]),
+            )
+        ):
+            raise ValueError
+
if record["Report Entity Type"] == "Candidate":
# Create PAC associated with candidate
self.make_pac(record, entity_type="Political Committee")
@@ -447,51 +455,41 @@ def make_filing(self, record):
)
raise ValueError

-        # This is fudged and should be drawn instead from a canonical list of offices and races.
-        # We need a campaign for committee finances to be associated with the candidate.
-        election_year = self.parse_date(record["Start of Period"]).year

-        election_season = self.fetch_from_cache(
-            "election_season",
-            (election_year, False, 0),
-            models.ElectionSeason,
-            dict(
-                year=self.parse_date(record["Start of Period"]).year,
-                special=False,
-                status_id=0,
-            ),
+        transaction_date = (
+            record["Transaction Date"]
+            if "Transaction Date" in record
+            else record["Expenditure Date"]
        )

-        office = self.fetch_from_cache(
-            "office",
-            None,
-            models.Office,
-            dict(description="Not specified", status_id=0),
-        )
+        transaction_year = self.parse_date(transaction_date).year

-        party = self.fetch_from_cache(
-            "party",
-            None,
-            models.PoliticalParty,
-            dict(name="Not specified"),
-        )
+        try:
+            campaign = self._cache["campaign"][
+                (candidate.full_name, transaction_year)
+            ]

-        campaign = self.fetch_from_cache(
-            "campaign",
-            (
-                record["Committee Name"],
-                candidate.full_name,
-                election_season.year,
-            ),
-            models.Campaign,
-            dict(
-                committee_name=record["Committee Name"],
-                candidate=candidate,
-                election_season=election_season,
-                office=office,
-                political_party=party,
-            ),
-        )
+        except KeyError:
+            try:
+                campaign = candidate.campaign_set.get(
+                    election_season__year=transaction_year
+                )
+            except models.Campaign.MultipleObjectsReturned:
+                campaign = candidate.campaign_set.filter(
+                    election_season__year=transaction_year
+                ).first()
+            except models.Campaign.DoesNotExist:
+                self.stderr.write(
+                    f"Could not find campaign for {candidate} in {transaction_year}"
+                )
+                campaign = None

+            if campaign and campaign.committee_name != record["Committee Name"]:
+                campaign.committee_name = record["Committee Name"]
+                campaign.save()

+            self._cache["campaign"][
+                (candidate.full_name, transaction_year)
+            ] = campaign

filing_kwargs = {"entity": entity, "campaign": campaign}

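Taken together, make_filing no longer fabricates a placeholder campaign (the removed "fudged" season, office, and party records); it now resolves a campaign that import_office_api_data is expected to have created, keyed by the candidate and the year of the transaction. Condensed into a single helper for illustration (the function name is ours, and the get()/MultipleObjectsReturned/DoesNotExist branches are collapsed into filter().first()):

    # Illustrative sketch of the lookup order above, not part of the diff.
    def resolve_campaign(cache, candidate, transaction_year):
        # In-memory cache first, then campaigns seeded by import_office_api_data,
        # else None and the filing is stored without a campaign.
        key = (candidate.full_name, transaction_year)
        if key not in cache:
            cache[key] = candidate.campaign_set.filter(
                election_season__year=transaction_year
            ).first()
        return cache[key]
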
270 changes: 270 additions & 0 deletions camp_fin/management/commands/import_office_api_data.py
@@ -0,0 +1,270 @@
import csv
import gzip
import os
import re

from django.core.management.base import BaseCommand
from django.db.models import Max
from django.db.utils import IntegrityError
from django.utils.text import slugify

import boto3
from tqdm import tqdm
import probablepeople

from camp_fin import models


class Command(BaseCommand):
help = """
Import data from the New Mexico Campaign Finance System:
https://github.com/datamade/nmid-scrapers/pull/2

Data will be retrieved from S3 unless a local CSV is specified as --file
"""

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

try:
self._next_entity_id = (
models.Entity.objects.aggregate(max_id=Max("user_id"))["max_id"] + 1
)
except TypeError:
self._next_entity_id = 1

self._cache = {
"office": {
obj.description: obj for obj in models.Office.objects.iterator()
},
"office_type": {
obj.description: obj for obj in models.OfficeType.objects.iterator()
},
"political_party": {
obj.name: obj for obj in models.PoliticalParty.objects.iterator()
},
"district": {
(obj.name, obj.office.description): obj
for obj in models.District.objects.iterator()
},
"county": {obj.name: obj for obj in models.County.objects.iterator()},
"election_season": {
(obj.year, obj.special): obj
for obj in models.ElectionSeason.objects.iterator()
},
"candidate": {},
"entity_type": {
obj.description: obj for obj in models.EntityType.objects.iterator()
},
}

def add_arguments(self, parser):
parser.add_argument(
"--file",
dest="file",
help="Absolute path of CSV file to import",
required=False,
)

def handle(self, *args, **options):
if options["file"]:
f = open(options["file"], "r")

else:
s3 = boto3.client("s3")

resource_name = "offices.gz"

with open(resource_name, "wb") as download_location:
s3.download_fileobj(
os.getenv("AWS_STORAGE_BUCKET_NAME", "openness-project-nmid"),
resource_name,
download_location,
)

f = gzip.open(resource_name, "rt")

try:
reader = csv.DictReader(f)

candidates_created = 0
candidates_linked = 0
candidates_skipped = 0
campaigns = []

for record in tqdm(reader):
name_parts = record["CandidateName"].split(",")

try:
candidate_name, _ = probablepeople.tag(
" ".join(
[name_parts[1], name_parts[0], " ".join(name_parts[2:])]
)
)
except probablepeople.RepeatedLabelError:
self.stderr.write(
f"Could not parse candidate name {record['CandidateName']}. Skipping..."
)
candidates_skipped += 1
continue

full_name = re.sub(
r"\s{2,}",
" ",
" ".join(
[
candidate_name.get("GivenName", ""),
candidate_name.get("MiddleName", "")
or candidate_name.get("MiddleInitial", ""),
candidate_name.get("Surname", ""),
candidate_name.get("SuffixGenerational", "")
or candidate_name.get("SuffixOther", ""),
]
),
).strip()

try:
candidate = self.fetch_from_cache(
"candidate", full_name, models.Candidate, {}, create=False
)
candidates_linked += 1
except KeyError:
try:
candidate = models.Candidate.objects.get(full_name=full_name)
self._cache["candidate"][candidate.full_name] = candidate
candidates_linked += 1

except models.Candidate.MultipleObjectsReturned:
candidate = models.Candidate.objects.filter(
full_name=full_name
).first()
self._cache["candidate"][candidate.full_name] = candidate
candidates_linked += 1

except models.Candidate.DoesNotExist:
entity_type = self.fetch_from_cache(
"entity_type",
"Candidate",
models.EntityType,
{"description": "Candidate"},
)

entity = models.Entity.objects.create(
user_id=self._next_entity_id,
entity_type=entity_type,
)

candidate = self.fetch_from_cache(
"candidate",
full_name,
models.Candidate,
dict(
first_name=candidate_name.get("GivenName", None),
middle_name=candidate_name.get("MiddleName", None)
or candidate_name.get("MiddleInitial", None),
last_name=candidate_name.get("Surname", None),
suffix=candidate_name.get("SuffixGenerational", None)
or candidate_name.get("SuffixOther", None),
full_name=full_name,
slug=slugify(
" ".join(
[
candidate_name.get("GivenName", ""),
candidate_name.get("Surname", ""),
]
)
),
entity=entity,
),
)

self._next_entity_id += 1

candidates_created += 1

election_year = re.match(r"\d{4}", record["ElectionName"]).group(0)

election_season = self.fetch_from_cache(
"election_season",
(election_year, False),
models.ElectionSeason,
{"year": election_year, "special": False, "status_id": 0},
)
office = self.fetch_from_cache(
"office",
record["OfficeName"],
models.Office,
{"description": record["OfficeName"], "status_id": 0},
)
office_type = self.fetch_from_cache(
"office_type",
record["JurisdictionType"],
models.OfficeType,
{"description": record["JurisdictionType"]},
)
political_party = self.fetch_from_cache(
"political_party",
record["Party"],
models.PoliticalParty,
{"name": record["Party"]},
)
district = self.fetch_from_cache(
"district",
(record["District"], office.description),
models.District,
{"name": record["District"], "office": office, "status_id": 0},
)

if record["Jurisdiction"]:
county = self.fetch_from_cache(
"county",
record["Jurisdiction"],
models.County,
{"name": record["Jurisdiction"]},
)
else:
county = None

campaigns.append(
models.Campaign(
election_season=election_season,
candidate=candidate,
office=office,
district=district,
county=county,
political_party=political_party,
)
)

models.Campaign.objects.filter(election_season__year__gte=2021).delete()
models.Campaign.objects.bulk_create(campaigns)

finally:
f.close()

self.stderr.write(
f"Linked {candidates_linked} candidates with a campaign, created {candidates_created} candidates, skipped {candidates_skipped} candidates"
)

def fetch_from_cache(
self, cache_entity, cache_key, model, model_kwargs, create=True
):
try:
return self._cache[cache_entity][cache_key]
except KeyError:
if not create:
raise

deidentified_model_kwargs = {
k: v for k, v in model_kwargs.items() if k not in ("entity", "slug")
}

try:
obj = model.objects.get(**deidentified_model_kwargs)
except model.DoesNotExist:
obj = model.objects.create(**model_kwargs)
except model.MultipleObjectsReturned:
obj = model.objects.filter(**deidentified_model_kwargs).first()

self._cache[cache_entity][cache_key] = obj
return obj
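
Because the scraper emits CandidateName as "Surname,GivenName ...", the command reorders the parts before handing them to probablepeople, then rebuilds a normalized full_name to match against existing Candidate rows. A standalone sketch of that normalization (the sample name is invented, and probablepeople's tags can vary by input):

    # Illustrative sketch, not part of the diff.
    import re

    import probablepeople

    raw = "Smith,Jane A"  # hypothetical CandidateName value
    parts = raw.split(",")
    tagged, _ = probablepeople.tag(
        " ".join([parts[1], parts[0], " ".join(parts[2:])])
    )
    full_name = re.sub(
        r"\s{2,}",
        " ",
        " ".join(
            [
                tagged.get("GivenName", ""),
                tagged.get("MiddleName", "") or tagged.get("MiddleInitial", ""),
                tagged.get("Surname", ""),
                tagged.get("SuffixGenerational", "") or tagged.get("SuffixOther", ""),
            ]
        ),
    ).strip()
    print(full_name)  # expected to be something like "Jane A Smith"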