Merge pull request #6 from internetstandards/domain_normalization

expect datasources to contain uppercase letters in source data
internetstandards · Nov 7, 2024 · 80f0aad · 80f0aad
2 parents af119b2 + 8f11fe0
commit 80f0aad
Show file tree

Hide file tree

Showing 5 changed files with 20 additions and 37 deletions.
diff --git a/src/ctlssa/suggestions/logic/domains.py b/src/ctlssa/suggestions/logic/domains.py
@@ -23,6 +23,9 @@ def add_domains(all_domains: List[str]) -> int:
         # prevent double requests and other behavior that causes redundancies to create duplicate records
         # this saves a TON of space. Run this first, before extracting the domain, a small optimization.
 
+        # Normalize case and whitespace: a data source may be polluted (merklemap is, and CT can have an off-day too).
+        domain = domain.lower().strip()
+
         # Warning: in bulk operations the deque is EXTREMELY slow. A longer deque with 100k records will slow down
         # imports to a near halt. So assume in bulk-imports that this deque is not needed.
         if domain in recently_added:
@@ -87,6 +90,9 @@ def __init__(self):
     def add_domain(self, domain: str, processing_date: date):
         # wildcards do not exist in the hostname field, so no need to filter.
 
+        # Normalize case and whitespace: a data source may be polluted (merklemap is, and CT can have an off-day too).
+        domain = domain.lower().strip()
+
         # we can use this shortcut as there are no two level top level domains in the dutch zones
         # the partition method is the fastest:
         rest, delimiter, suffix = domain.rpartition(".")