Skip to content

Commit

Permalink
More aggressive caching by recency
Browse files — browse the repository at this point in the history
  • Loading branch information
amywieliczka committed Oct 17, 2024
1 parent 3c15a9b commit 92e79b9
Showing 1 changed file with 45 additions and 23 deletions.
68 changes: 45 additions & 23 deletions content_harvester/by_record.py
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
import hashlib
import os
from datetime import datetime
from datetime import datetime, timedelta
from typing import Optional, Any, Tuple
from functools import lru_cache
from dataclasses import dataclass, asdict
Expand Down
Expand Up
@@ -226,36 +226,58 @@ def check_component_cache(
):
print(
f"{component_type}: No ETag or Last-Modified headers, "
"skipping cache check"
"checking cache and judging cache hit based on URL and "
"date since content component creation"
)
# always a cache miss, also don't cache this
component, tmp_files = create_component(collection_id, request, *args, **kwargs)
return {**component, 'from-cache': False}, tmp_files

# Create cache key
cache_key = '/'.join([
str(collection_id),
quote_plus(request.url),
component_type,
quote_plus(head_resp.headers.get('ETag', '')),
quote_plus(head_resp.headers.get('Last-Modified', ''))
])
# Create cache key without ETag or Last-Modified
cache_key = '/'.join([
str(collection_id),
quote_plus(request.url),
component_type
])
else:
# Create specific cache key
cache_key = '/'.join([
str(collection_id),
quote_plus(request.url),
component_type,
quote_plus(head_resp.headers.get('ETag', '')),
quote_plus(head_resp.headers.get('Last-Modified', ''))
])

# Check cache for component or create component
component = persistent_cache.get(cache_key)
if component:
date_component_created = (component
['component_content_harvest_metadata']
['date_content_component_created']
)
print(
f"Retrieved {component_type} component from cache for "
f"Retrieved {component_type} component created on "
f"{date_component_created} from cache for "
f"{request.url}"
)
return {**component, 'from-cache': True}, []
else:
component, tmp_files = create_component(
collection_id, request, *args, **kwargs)
print(f"Created {component_type} component for {request.url}")
# set cache key to the component
persistent_cache[cache_key] = component
return {**component, 'from-cache': False}, tmp_files
# the cache key is specific with etag and/or last-modified
if len(cache_key.split('/')) == 5:
return {**component, 'from-cache': True}, []

# the cache key is not specific, so check if the cache hit
# was created within the last week
date_component_created = datetime.fromisoformat(
date_component_created)
today = datetime.today()
seven_days_ago = today - timedelta(days=7)
if seven_days_ago <= date_component_created <= today:
return {**component, 'from-cache': True}, []

print("Cache hit is older than 7 days, re-creating component")

component, tmp_files = create_component(
collection_id, request, *args, **kwargs)
print(f"Created {component_type} component for {request.url}")
# set cache key to the component
persistent_cache[cache_key] = component
return {**component, 'from-cache': False}, tmp_files

return check_component_cache
return inner
Expand Down

0 comments on commit 92e79b9

Please sign in to comment.