Skip to content

Commit

Permalink
#161 Disable embedding creation: Commented out content_chunks creation
Browse files (browse the repository at this point in the history)
  • Loading branch information
m-i-l committed Nov 3, 2024
1 parent 800cf4d commit 9a6e707
Showing 1 changed file with 11 additions and 11 deletions.
22 changes: 11 additions & 11 deletions src/indexing/indexer/spiders/search_my_site_parser.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -295,11 +295,11 @@ def customparser(response, domain, is_home, domains_for_indexed_links, site_conf

# content_chunks (pseudo-field for nested documents)
# Get values from the previously indexed version of this page
previous_content_chunks = None
if response.url in previous_contents: # previous_contents already defined above
previous_page = previous_contents[response.url]
if 'content_chunks' in previous_page:
previous_content_chunks = previous_page['content_chunks']
#previous_content_chunks = None
#if response.url in previous_contents: # previous_contents already defined above
# previous_page = previous_contents[response.url]
# if 'content_chunks' in previous_page:
# previous_content_chunks = previous_page['content_chunks']
# Scenarios:
# 1. Content has changed, or content is new, or there aren't any existing content_chunks (e.g. on first run) - regenerate
# 2. Else reuse previous values
Expand All @@ -309,12 +309,12 @@ def customparser(response, domain, is_home, domains_for_indexed_links, site_conf
# has changed. That may be fine for minor embedding config changes, e.g. a change to the chunk length, but could be breaking
# for significant embedding config changes, e.g. if the embedding model is changed. Suggestion in the case of significant
# config changes is to delete all embeddings, e.g. via <delete><query>relationship:child</query></delete> .
if (previous_content and new_content and previous_content != new_content) or (new_content and not previous_content) or (not previous_content_chunks):
content_chunks = get_content_chunks(content_text, site_config['content_chunks_limit'], item['id'], item['url'], domain)
else:
logger.debug("Reusing existing embeddings for {}".format(item['id']))
content_chunks = previous_content_chunks
item['content_chunks'] = content_chunks
#if (previous_content and new_content and previous_content != new_content) or (new_content and not previous_content) or (not previous_content_chunks):
# content_chunks = get_content_chunks(content_text, site_config['content_chunks_limit'], item['id'], item['url'], domain)
#else:
# logger.debug("Reusing existing embeddings for {}".format(item['id']))
# content_chunks = previous_content_chunks
#item['content_chunks'] = content_chunks

# published_date
published_date = response.xpath('//meta[@property="article:published_time"]/@content').get()
Expand Down

0 comments on commit 9a6e707

Please sign in to comment.