Import and link campaigns #180

Merged · 5 commits · Jan 19, 2024
3 changes: 2 additions & 1 deletion .gitignore
@@ -3,4 +3,5 @@ __pycache__
*.pyc
/static
*.csv
-.env
+.env
+!_data/raw/offices.csv
12 changes: 9 additions & 3 deletions Makefile
@@ -5,9 +5,15 @@ import/% : s3/$(AWS_STORAGE_BUCKET_NAME)/%.gz
--year $(word 2, $(subst _, , $*))

import/local/% : _data/raw/%.csv
-    python manage.py import_api_data --transaction-type $(word 1, $(subst _, , $*)) \
-        --year $(word 2, $(subst _, , $*)) \
-        --file $<
+    python manage.py import_api_data --transaction-type $(word 1, $(subst _, , $*)) \
+        --year $(word 2, $(subst _, , $*)) \
+        --file $<
+
+import/offices :
+    python manage.py import_office_api_data
+
+import/local/offices : _data/raw/offices.csv
+    python manage.py import_office_api_data --file $<

s3/$(AWS_STORAGE_BUCKET_NAME)/%.gz : %.gz
aws s3 cp $< s3://$$AWS_STORAGE_BUCKET_NAME
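
The two new targets mirror the existing transaction imports: import/offices downloads the scraped office data from S3, while import/local/offices reads the checked-in _data/raw/offices.csv. As a minimal sketch of the equivalent call from Python, assuming Django settings for this project are already configured (the Makefile itself simply shells out to manage.py):

    # Illustrative sketch, not part of the diff: mirrors `make import/local/offices`.
    from django.core.management import call_command

    # Same as `python manage.py import_office_api_data --file _data/raw/offices.csv`
    call_command("import_office_api_data", file="_data/raw/offices.csv")

    # Omitting --file makes the command download offices.gz from S3 instead,
    # as its handle() method below shows.
    call_command("import_office_api_data")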
1,029 changes: 1,029 additions & 0 deletions _data/raw/offices.csv

Large diffs are not rendered by default.

80 changes: 39 additions & 41 deletions camp_fin/management/commands/import_api_data.py
@@ -356,6 +356,14 @@ def make_pac(self, record, entity_type=None):
)

def make_filing(self, record):
+        if not any(
+            (
+                self.parse_date(record["Filed Date"]),
+                self.parse_date(record["End of Period"]),
+            )
+        ):
+            raise ValueError
+
if record["Report Entity Type"] == "Candidate":
# Create PAC associated with candidate
self.make_pac(record, entity_type="Political Committee")
@@ -447,51 +455,41 @@ def make_filing(self, record):
)
raise ValueError

-        # This is fudged and should be drawn instead from a canonical list of offices and races.
-        # We need a campaign for committee finances to be associated with the candidate.
-        election_year = self.parse_date(record["Start of Period"]).year

-        election_season = self.fetch_from_cache(
-            "election_season",
-            (election_year, False, 0),
-            models.ElectionSeason,
-            dict(
-                year=self.parse_date(record["Start of Period"]).year,
-                special=False,
-                status_id=0,
-            ),
+        transaction_date = (
+            record["Transaction Date"]
+            if "Transaction Date" in record
+            else record["Expenditure Date"]
        )

-        office = self.fetch_from_cache(
-            "office",
-            None,
-            models.Office,
-            dict(description="Not specified", status_id=0),
-        )
+        transaction_year = self.parse_date(transaction_date).year

-        party = self.fetch_from_cache(
-            "party",
-            None,
-            models.PoliticalParty,
-            dict(name="Not specified"),
-        )
+        try:
+            campaign = self._cache["campaign"][
+                (candidate.full_name, transaction_year)
+            ]

-        campaign = self.fetch_from_cache(
-            "campaign",
-            (
-                record["Committee Name"],
-                candidate.full_name,
-                election_season.year,
-            ),
-            models.Campaign,
-            dict(
-                committee_name=record["Committee Name"],
-                candidate=candidate,
-                election_season=election_season,
-                office=office,
-                political_party=party,
-            ),
-        )
+        except KeyError:
+            try:
+                campaign = candidate.campaign_set.get(
+                    election_season__year=transaction_year
+                )
+            except models.Campaign.MultipleObjectsReturned:
+                campaign = candidate.campaign_set.filter(
+                    election_season__year=transaction_year
+                ).first()
+            except models.Campaign.DoesNotExist:
+                self.stderr.write(
+                    f"Could not find campaign for {candidate} in {transaction_year}"
+                )
+                campaign = None

+            if campaign and campaign.committee_name != record["Committee Name"]:
+                campaign.committee_name = record["Committee Name"]
+                campaign.save()

+            self._cache["campaign"][
+                (candidate.full_name, transaction_year)
+            ] = campaign

filing_kwargs = {"entity": entity, "campaign": campaign}

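Taken together, make_filing no longer fabricates a placeholder campaign (the removed "fudged" season, office, and party records); it now resolves a campaign that import_office_api_data is expected to have created, keyed by the candidate and the year of the transaction. Condensed into a single helper for illustration (the function name is ours, and the get()/MultipleObjectsReturned/DoesNotExist branches are collapsed into filter().first()):

    # Illustrative sketch of the lookup order above, not part of the diff.
    def resolve_campaign(cache, candidate, transaction_year):
        # In-memory cache first, then campaigns seeded by import_office_api_data,
        # else None and the filing is stored without a campaign.
        key = (candidate.full_name, transaction_year)
        if key not in cache:
            cache[key] = candidate.campaign_set.filter(
                election_season__year=transaction_year
            ).first()
        return cache[key]
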
270 changes: 270 additions & 0 deletions camp_fin/management/commands/import_office_api_data.py
@@ -0,0 +1,270 @@
import csv
import gzip
import os
import re

from django.core.management.base import BaseCommand
from django.db.models import Max
from django.db.utils import IntegrityError
from django.utils.text import slugify

import boto3
from tqdm import tqdm
import probablepeople

from camp_fin import models


class Command(BaseCommand):
help = """
Import data from the New Mexico Campaign Finance System:
https://github.com/datamade/nmid-scrapers/pull/2

Data will be retrieved from S3 unless a local CSV is specified as --file
"""

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

try:
self._next_entity_id = (
models.Entity.objects.aggregate(max_id=Max("user_id"))["max_id"] + 1
)
except TypeError:
self._next_entity_id = 1

self._cache = {
"office": {
obj.description: obj for obj in models.Office.objects.iterator()
},
"office_type": {
obj.description: obj for obj in models.OfficeType.objects.iterator()
},
"political_party": {
obj.name: obj for obj in models.PoliticalParty.objects.iterator()
},
"district": {
(obj.name, obj.office.description): obj
for obj in models.District.objects.iterator()
},
"county": {obj.name: obj for obj in models.County.objects.iterator()},
"election_season": {
(obj.year, obj.special): obj
for obj in models.ElectionSeason.objects.iterator()
},
"candidate": {},
"entity_type": {
obj.description: obj for obj in models.EntityType.objects.iterator()
},
}

def add_arguments(self, parser):
parser.add_argument(
"--file",
dest="file",
help="Absolute path of CSV file to import",
required=False,
)

def handle(self, *args, **options):
if options["file"]:
f = open(options["file"], "r")

else:
s3 = boto3.client("s3")

resource_name = "offices.gz"

with open(resource_name, "wb") as download_location:
s3.download_fileobj(
os.getenv("AWS_STORAGE_BUCKET_NAME", "openness-project-nmid"),
resource_name,
download_location,
)

f = gzip.open(resource_name, "rt")

try:
reader = csv.DictReader(f)

candidates_created = 0
candidates_linked = 0
candidates_skipped = 0
campaigns = []

for record in tqdm(reader):
name_parts = record["CandidateName"].split(",")

try:
candidate_name, _ = probablepeople.tag(
" ".join(
[name_parts[1], name_parts[0], " ".join(name_parts[2:])]
)
)
except probablepeople.RepeatedLabelError:
self.stderr.write(
f"Could not parse candidate name {record['CandidateName']}. Skipping..."
)
candidates_skipped += 1
continue

full_name = re.sub(
r"\s{2,}",
" ",
" ".join(
[
candidate_name.get("GivenName", ""),
candidate_name.get("MiddleName", "")
or candidate_name.get("MiddleInitial", ""),
candidate_name.get("Surname", ""),
candidate_name.get("SuffixGenerational", "")
or candidate_name.get("SuffixOther", ""),
]
),
).strip()

try:
candidate = self.fetch_from_cache(
"candidate", full_name, models.Candidate, {}, create=False
)
candidates_linked += 1
except KeyError:
try:
candidate = models.Candidate.objects.get(full_name=full_name)
self._cache["candidate"][candidate.full_name] = candidate
candidates_linked += 1

except models.Candidate.MultipleObjectsReturned:
candidate = models.Candidate.objects.filter(
full_name=full_name
).first()
self._cache["candidate"][candidate.full_name] = candidate
candidates_linked += 1

except models.Candidate.DoesNotExist:
entity_type = self.fetch_from_cache(
"entity_type",
"Candidate",
models.EntityType,
{"description": "Candidate"},
)

entity = models.Entity.objects.create(
user_id=self._next_entity_id,
entity_type=entity_type,
)

candidate = self.fetch_from_cache(
"candidate",
full_name,
models.Candidate,
dict(
first_name=candidate_name.get("GivenName", None),
middle_name=candidate_name.get("MiddleName", None)
or candidate_name.get("MiddleInitial", None),
last_name=candidate_name.get("Surname", None),
suffix=candidate_name.get("SuffixGenerational", None)
or candidate_name.get("SuffixOther", None),
full_name=full_name,
slug=slugify(
" ".join(
[
candidate_name.get("GivenName", ""),
candidate_name.get("Surname", ""),
]
)
),
entity=entity,
),
)

self._next_entity_id += 1

candidates_created += 1

election_year = re.match(r"\d{4}", record["ElectionName"]).group(0)

election_season = self.fetch_from_cache(
"election_season",
(election_year, False),
models.ElectionSeason,
{"year": election_year, "special": False, "status_id": 0},
)
office = self.fetch_from_cache(
"office",
record["OfficeName"],
models.Office,
{"description": record["OfficeName"], "status_id": 0},
)
office_type = self.fetch_from_cache(
"office_type",
record["JurisdictionType"],
models.OfficeType,
{"description": record["JurisdictionType"]},
)
political_party = self.fetch_from_cache(
"political_party",
record["Party"],
models.PoliticalParty,
{"name": record["Party"]},
)
district = self.fetch_from_cache(
"district",
(record["District"], office.description),
models.District,
{"name": record["District"], "office": office, "status_id": 0},
)

if record["Jurisdiction"]:
county = self.fetch_from_cache(
"county",
record["Jurisdiction"],
models.County,
{"name": record["Jurisdiction"]},
)
else:
county = None

campaigns.append(
models.Campaign(
election_season=election_season,
candidate=candidate,
office=office,
district=district,
county=county,
political_party=political_party,
)
)

models.Campaign.objects.filter(election_season__year__gte=2021).delete()
models.Campaign.objects.bulk_create(campaigns)

finally:
f.close()

self.stderr.write(
f"Linked {candidates_linked} candidates with a campaign, created {candidates_created} candidates, skipped {candidates_skipped} candidates"
)

def fetch_from_cache(
self, cache_entity, cache_key, model, model_kwargs, create=True
):
try:
return self._cache[cache_entity][cache_key]
except KeyError:
if not create:
raise

deidentified_model_kwargs = {
k: v for k, v in model_kwargs.items() if k not in ("entity", "slug")
}

try:
obj = model.objects.get(**deidentified_model_kwargs)
except model.DoesNotExist:
obj = model.objects.create(**model_kwargs)
except model.MultipleObjectsReturned:
obj = model.objects.filter(**deidentified_model_kwargs).first()

self._cache[cache_entity][cache_key] = obj
return obj
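
Because the scraper emits CandidateName as "Surname,GivenName ...", the command reorders the parts before handing them to probablepeople, then rebuilds a normalized full_name to match against existing Candidate rows. A standalone sketch of that normalization (the sample name is invented, and probablepeople's tags can vary by input):

    # Illustrative sketch, not part of the diff.
    import re

    import probablepeople

    raw = "Smith,Jane A"  # hypothetical CandidateName value
    parts = raw.split(",")
    tagged, _ = probablepeople.tag(
        " ".join([parts[1], parts[0], " ".join(parts[2:])])
    )
    full_name = re.sub(
        r"\s{2,}",
        " ",
        " ".join(
            [
                tagged.get("GivenName", ""),
                tagged.get("MiddleName", "") or tagged.get("MiddleInitial", ""),
                tagged.get("Surname", ""),
                tagged.get("SuffixGenerational", "") or tagged.get("SuffixOther", ""),
            ]
        ),
    ).strip()
    print(full_name)  # expected to be something like "Jane A Smith"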