fivetran · fivetran-reneeli · Nov 20, 2024 · Nov 20, 2024 · Nov 22, 2024 · Nov 22, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,14 @@
+# dbt_unified_rag v0.1.0-a4
+
+## Breaking Changes
+- Added the hubspot `engagement` source table to the package and made the following updates:
+    - Added `stg_rag_hubspot__engagement` model as part of the hubspot staging models.
+    - Updated `int_rag_hubspot__deal_document` to change how the `hubspot_engagement_*` models are joined: the hubspot `engagement` table is now used as the intermediary joining table for the `engagement_contact` and `engagement_company` tables.
+    - Updated `int_rag_hubspot__deal_document` to retrieve `engagement_type` from the hubspot `engagement` table as opposed to the `engagement_emails` and `engagement_notes` tables. As a result, the references to those two tables have been removed from this model, as they are no longer used there.
+
+## Bug Fix (`--full-refresh` required when upgrading)
+- Updated the unique key in `rag__unified_document` to include `chunk_index`. Previously, the unique key was a combination of only `document_id`, `platform`, and `source_relation`, which was potentially inaccurate if there were multiple chunks associated with a document.
+
 # dbt_unified_rag v0.1.0-a3
 [PR #9](https://github.com/fivetran/dbt_unified_rag/pull/9) includes the following updates: 
 

diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # Unified RAG dbt Package ([Docs](https://fivetran.github.io/dbt_unified_rag/))
 
-<p align="center">
+<p align="left">
     <a alt="License"
         href="https://github.com/fivetran/dbt_unified_rag/blob/main/LICENSE">
         <img src="https://img.shields.io/badge/License-Apache%202.0-blue.svg" /></a>
@@ -46,7 +46,7 @@ Include the following package_display_name package version in your `packages.yml
 ```yml
 packages:
   - package: fivetran/unified_rag
-    version: 0.1.0-a3
+    version: 0.1.0-a4
 ```
 
 ### Step 3: Define database and schema variables

diff --git a/dbt_project.yml b/dbt_project.yml
@@ -19,6 +19,7 @@ vars:
     jira_priority: "{{ source('rag_jira', 'priority') }}"
 
     # Hubspot Sources
+    hubspot_engagement: "{{ source('rag_hubspot', 'engagement') }}"
     hubspot_engagement_note: "{{ source('rag_hubspot', 'engagement_note') }}"
     hubspot_engagement_email: "{{ source('rag_hubspot', 'engagement_email') }}"
     hubspot_engagement_company: "{{ source('rag_hubspot', 'engagement_company') }}"

diff --git a/docs/catalog.json b/docs/catalog.json
diff --git a/docs/index.html b/docs/index.html
diff --git a/docs/manifest.json b/docs/manifest.json
diff --git a/integration_tests/ci/sample.profiles.yml b/integration_tests/ci/sample.profiles.yml
@@ -16,13 +16,13 @@ integration_tests:
       pass: "{{ env_var('CI_REDSHIFT_DBT_PASS') }}"
       dbname: "{{ env_var('CI_REDSHIFT_DBT_DBNAME') }}"
       port: 5439
-      schema: rag_integration_tests_1
+      schema: rag_integration_tests_3
       threads: 8
     bigquery:
       type: bigquery
       method: service-account-json
       project: 'dbt-package-testing'
-      schema: rag_integration_tests_1
+      schema: rag_integration_tests_3
       threads: 8
       keyfile_json: "{{ env_var('GCLOUD_SERVICE_KEY') | as_native }}"
     snowflake:
@@ -33,7 +33,7 @@ integration_tests:
       role: "{{ env_var('CI_SNOWFLAKE_DBT_ROLE') }}"
       database: "{{ env_var('CI_SNOWFLAKE_DBT_DATABASE') }}"
       warehouse: "{{ env_var('CI_SNOWFLAKE_DBT_WAREHOUSE') }}"
-      schema: rag_integration_tests_1
+      schema: rag_integration_tests_3
       threads: 8
     postgres:
       type: postgres
@@ -42,13 +42,13 @@ integration_tests:
       pass: "{{ env_var('CI_POSTGRES_DBT_PASS') }}"
       dbname: "{{ env_var('CI_POSTGRES_DBT_DBNAME') }}"
       port: 5432
-      schema: rag_integration_tests_1
+      schema: rag_integration_tests_3
       threads: 8
     databricks:
       catalog: "{{ env_var('CI_DATABRICKS_DBT_CATALOG') }}"
       host: "{{ env_var('CI_DATABRICKS_DBT_HOST') }}"
       http_path: "{{ env_var('CI_DATABRICKS_DBT_HTTP_PATH') }}"
-      schema: rag_integration_tests_1
+      schema: rag_integration_tests_3
       threads: 2
       token: "{{ env_var('CI_DATABRICKS_DBT_TOKEN') }}"
       type: databricks
diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml
@@ -6,14 +6,15 @@ version: '0.1.0'
 profile: "integration_tests"
 
 vars:
-  rag_hubspot_schema: "rag_integration_tests_1"
-  rag_zendesk_schema: "rag_integration_tests_1"
-  rag_jira_schema: "rag_integration_tests_1"
+  rag_hubspot_schema: "rag_integration_tests_3"
+  rag_zendesk_schema: "rag_integration_tests_3"
+  rag_jira_schema: "rag_integration_tests_3"
 
   rag__using_jira: True
   rag__using_zendesk: True
   rag__using_hubspot: True
 
+  rag_hubspot_engagement_identifier: "hubspot_engagement"
   rag_hubspot_engagement_note_identifier: "hubspot_engagement_note"
   rag_hubspot_engagement_email_identifier: "hubspot_engagement_email"
   rag_hubspot_engagement_company_identifier: "hubspot_engagement_company"
@@ -32,7 +33,7 @@ vars:
   rag_zendesk_ticket_comment_identifier: "zendesk_ticket_comment"
   rag_zendesk_user_identifier: "zendesk_user"
 
-  document_max_tokens: 2000
+  document_max_tokens: 50
 
 seeds:
   rag_integration_tests:
@@ -98,6 +99,9 @@ seeds:
         _fivetran_synced: timestamp
         property_closedate: timestamp
         property_createdate: timestamp
+    hubspot_engagement:
+      +column_types:
+        id: "{{ 'int64' if target.type == 'bigquery' else 'bigint' }}"
     hubspot_engagement_company:
       +column_types:
         engagement_id: "{{ 'int64' if target.type == 'bigquery' else 'bigint' }}"

diff --git a/integration_tests/seeds/hubspot_engagement.csv b/integration_tests/seeds/hubspot_engagement.csv
@@ -0,0 +1,2 @@
+id,type,_fivetran_synced,portal_id
+19732910159,CALL,2023-06-08 23:22:38.270000,4703379
diff --git a/integration_tests/seeds/jira_comment.csv b/integration_tests/seeds/jira_comment.csv
@@ -1,4 +1,4 @@
 id,_fivetran_synced,author_id,body,created,is_public,issue_id,update_author_id,updated
-1,2020-11-12 12:20:53.148,1a,Hello,2020-11-10 19:19:41.224,true,10011,1a,2020-11-10 19:19:41.224
+1,2020-11-12 12:20:53.148,1a,The quick brown fox jumps over the lazy dog. This sentence uses every letter in the English alphabet. It is often used as a typing practice sentence. Repetition of this sentence will ensure a consistent length. The quick brown fox jumps over the lazy dog. This sentence uses every letter in the English alphabet. It is often used as a typing practice sentence. Repetition of this sentence will ensure a consistent length. The quick brown fox jumps over the lazy dog. This sentence uses every letter in the English alphabet. It is often used as a typing practice sentence. Repetition of this sentence will ensure a consistent length. The quick brown fox jumps over the lazy dog. This sentence uses every letter in the English alphabet. It is often used as a typing practice sentence. Repetition of this sentence will ensure a consistent length. The quick brown fox jumps over the lazy dog. This sentence uses every letter in the English alphabet. It is often used as a typing practice sentence.,2020-11-10 19:19:41.224,true,10011,1a,2020-11-10 19:19:41.224
 2,2020-11-10 19:21:48.619,1a,To Do to In Progress 6 days 22 hours 26 minutes ago In Progress to Done 3 days 16 hours 34 minutes ago,2020-11-07 02:45:38.717,true,10011,1a,2020-11-07 02:45:38.717
 3,2020-11-10 19:21:48.618,1a,Joined Sample Sprint 2 7 days 9 hours 10 minutes ago,2020-11-07 02:45:38.717,true,10011,1a,2020-11-07 02:45:38.717
diff --git a/macros/staging/hubspot/get_hubspot_engagement_columns.sql b/macros/staging/hubspot/get_hubspot_engagement_columns.sql
@@ -0,0 +1,16 @@
+{% macro get_hubspot_engagement_columns() %}
+
+{% set columns = [
+    {"name": "_fivetran_synced", "datatype": dbt.type_timestamp()},
+    {"name": "active", "datatype": "boolean", "alias": "is_active"},
+    {"name": "created_at", "datatype": dbt.type_timestamp(), "alias": "created_timestamp"},
+    {"name": "id", "datatype": dbt.type_int()},
+    {"name": "owner_id", "datatype": dbt.type_int()},
+    {"name": "portal_id", "datatype": dbt.type_int()},
+    {"name": "timestamp", "datatype": dbt.type_timestamp(), "alias": "occurred_timestamp"},
+    {"name": "type", "datatype": dbt.type_string(), "alias": "engagement_type"}
+] %}
+
+{{ return(columns) }}
+
+{% endmacro %}
diff --git a/models/intermediate/hubspot/int_rag_hubspot__deal_document.sql b/models/intermediate/hubspot/int_rag_hubspot__deal_document.sql
 {{ unified_rag.coalesce_cast(["engagement_emails.engagement_type", "engagement_notes.engagement_type", "'UNKNOWN'"], dbt.type_string()) }} as engagement_type, 
 {{ unified_rag.coalesce_cast(["engagement_emails.engagement_type", "engagement_notes.engagement_type", "'UNKNOWN'"], dbt.type_string()) }} as engagement_type, 
@@ -18,6 +18,11 @@ companies as (
     from {{ ref('stg_rag_hubspot__company') }}
 ), 
 
+engagements as (
+    select *
+    from {{ ref('stg_rag_hubspot__engagement') }}
+),
+
 engagement_companies as (
 
     select *
@@ -30,18 +35,6 @@ engagement_contacts as (
     from {{ ref('stg_rag_hubspot__engagement_contact') }}
 ),
 
-engagement_emails as (
-
-    select *
-    from {{ ref('stg_rag_hubspot__engagement_email') }} 
-),
-
-engagement_notes as (
-
-    select *
-    from {{ ref('stg_rag_hubspot__engagement_note') }}
-),
-
 engagement_deals as (
 
     select *
@@ -53,7 +46,7 @@ engagement_detail_prep as (
     select
         deals.deal_id,
         deals.deal_name,
-        {{ unified_rag.coalesce_cast(["engagement_emails.engagement_type", "engagement_notes.engagement_type", "'UNKNOWN'"], dbt.type_string()) }} as engagement_type,
+        {{ unified_rag.coalesce_cast(["engagements.engagement_type", "'UNKNOWN'"], dbt.type_string()) }} as engagement_type,
         {{ dbt.concat(["'https://app.hubspot.com/contacts'", "deals.portal_id", "'/record/0-3/'", "deals.deal_id"]) }} as url_reference,
         deals.source_relation,
         {{ unified_rag.coalesce_cast(["contacts.contact_name", "'UNKNOWN'"], dbt.type_string()) }} as contact_name,
@@ -64,24 +57,21 @@ engagement_detail_prep as (
     left join engagement_deals
         on deals.deal_id = engagement_deals.deal_id
         and deals.source_relation = engagement_deals.source_relation
+    left join engagements
+        on engagement_deals.engagement_id = engagements.engagement_id
+        and engagement_deals.source_relation = engagements.source_relation
     left join engagement_contacts
-        on engagement_deals.engagement_id = engagement_contacts.engagement_id 
-        and engagement_deals.source_relation = engagement_contacts.source_relation
+        on engagements.engagement_id = engagement_contacts.engagement_id 
+        and engagements.source_relation = engagement_contacts.source_relation
+    left join engagement_companies
+        on engagements.engagement_id = engagement_companies.engagement_id 
+        and engagements.source_relation = engagement_companies.source_relation
     left join contacts 
         on engagement_contacts.contact_id = contacts.contact_id
         and engagement_contacts.source_relation = contacts.source_relation
-    left join engagement_companies
-        on engagement_deals.engagement_id = engagement_companies.engagement_id 
-        and engagement_deals.source_relation = engagement_companies.source_relation
     left join companies
         on engagement_companies.company_id = companies.company_id
         and engagement_companies.source_relation = companies.source_relation
-    left join engagement_emails
-        on engagement_deals.engagement_id = engagement_emails.engagement_id
-        and engagement_deals.source_relation = engagement_emails.source_relation
-    left join engagement_notes
-        on engagement_deals.engagement_id = engagement_notes.engagement_id
-        and engagement_deals.source_relation = engagement_notes.source_relation
 ), 
 
 engagement_details as (

diff --git a/models/rag__unified_document.sql b/models/rag__unified_document.sql
@@ -16,7 +16,7 @@
 {% for platform in enabled_variables %}
     {% if var(platform) == true -%}
         {%- set platform_name = platform | replace('rag__using_', '') -%}
-        {%- set unique_key_fields = ['document_id', 'platform', 'source_relation'] -%}
+        {%- set unique_key_fields = ['document_id', 'platform', 'chunk_index', 'source_relation'] -%}
         {% set select_statement = (
         "select \n" ~
         "   " ~ dbt_utils.generate_surrogate_key(unique_key_fields) ~ "as unique_id, \n" ~

diff --git a/models/staging/hubspot_staging/src_rag_hubspot.yml b/models/staging/hubspot_staging/src_rag_hubspot.yml
@@ -6,6 +6,41 @@ sources:
     loaded_at_field: _fivetran_synced
 
     tables:
+      - name: engagement
+        identifier: "{{ var('rag_hubspot_engagement_identifier', 'engagement')}}"
+        description: Each record represents an engagement.
+        config:
+          enabled: "{{ var('rag_hubspot_sales_enabled', true) and var('rag_hubspot_engagement_enabled', true) }}"
+        columns:
+          - name: _fivetran_synced
+            description: '{{ doc("_fivetran_synced") }}'
+          - name: active
+            description: >
+              Whether the engagement is currently being shown in the UI.
+
+              PLEASE NOTE: This field will not be populated for connectors utilizing the HubSpot v3 API version. This field will be deprecated in a future release.
+          - name: created_at
+            description: >
+              A timestamp representing when the engagement was created.
+
+              PLEASE NOTE: This field will not be populated for connectors utilizing the HubSpot v3 API version. This field will be deprecated in a future release.
+          - name: id
+            description: The ID of the engagement.
+          - name: owner_id
+            description: >
+              The ID of the engagement's owner.
+
+              PLEASE NOTE: This field will not be populated for connectors utilizing the HubSpot v3 API version. This field will be deprecated in a future release.
+          - name: portal_id
+            description: '{{ doc("portal_id") }}'
+          - name: timestamp
+            description: >
+              A timestamp representing the time that the engagement should appear in the timeline.
+
+              PLEASE NOTE: This field will not be populated for connectors utilizing the HubSpot v3 API version. This field will be deprecated in a future release.
+          - name: type
+            description: One of NOTE, EMAIL, TASK, MEETING, or CALL, the type of the engagement.
+
       - name: engagement_note
         identifier: "{{ var('rag_hubspot_engagement_note_identifier', 'engagement_note')}}"
         description: Each record represents a NOTE engagement event.

diff --git a/models/staging/hubspot_staging/stg_rag_hubspot.yml b/models/staging/hubspot_staging/stg_rag_hubspot.yml
@@ -96,6 +96,39 @@ models:
       - name: source_relation
         description: The source of the record if the unioning functionality is being used. If it is not this field will be empty.
 
+  - name: stg_rag_hubspot__engagement
+    description: Each record represents an engagement.
+    columns:
+      - name: engagement_id
+        description: The ID of the engagement.
+        tests:
+          - not_null
+          - unique
+      - name: engagement_type
+        description: One of NOTE, EMAIL, TASK, MEETING, or CALL, the type of the engagement.
+      - name: is_active
+        description: >
+          Whether the engagement is currently being shown in the UI.
+
+          PLEASE NOTE - This field will only be populated for pre HubSpot v3 API versions. This field is only included to allow for backwards compatibility between HubSpot API versions. This field will be deprecated in the near future.
+      - name: occurred_timestamp
+        description: >
+          A timestamp representing the time that the engagement should appear in the timeline.
+
+          PLEASE NOTE - This field will only be populated for pre HubSpot v3 API versions. This field is only included to allow for backwards compatibility between HubSpot API versions. This field will be deprecated in the near future.
+      - name: created_timestamp
+        description: >
+          A timestamp representing when the engagement was created.
+
+          PLEASE NOTE: This field will only be populated for pre HubSpot v3 API versions. This field is only included to allow for backwards compatibility between HubSpot API versions. This field will be deprecated in the near future.
+      - name: owner_id
+        description: >
+          The ID of the engagement's owner.
+
+          PLEASE NOTE - This field will only be populated for pre HubSpot v3 API versions. This field is only included to allow for backwards compatibility between HubSpot API versions. This field will be deprecated in the near future.
+      - name: portal_id
+        description: '{{ doc("portal_id") }}'
+
   - name: stg_rag_hubspot__engagement_company
     description: Each record represents a 'link' between a company and an engagement.
     columns:

diff --git a/models/staging/hubspot_staging/stg_rag_hubspot__engagement.sql b/models/staging/hubspot_staging/stg_rag_hubspot__engagement.sql
@@ -0,0 +1,51 @@
+{{ config(enabled=var('rag__using_hubspot', True)) }}
+
+with base as (
+
+    {{
+        fivetran_utils.union_data(
+            table_identifier='engagement', 
+            database_variable='rag_hubspot_database', 
+            schema_variable='rag_hubspot_schema', 
+            default_database=target.database,
+            default_schema='rag_hubspot',
+            default_variable='hubspot_engagement',
+            union_schema_variable='rag_hubspot_union_schemas',
+            union_database_variable='rag_hubspot_union_databases'
+        )
+    }}
+),
+
+fields as (
+
+    select 
+        {{
+            fivetran_utils.fill_staging_columns(
+                source_columns=adapter.get_columns_in_relation(source('rag_hubspot','engagement')),
+                staging_columns=get_hubspot_engagement_columns()
+            )
+        }}
+
+        {{ fivetran_utils.source_relation(
+            union_schema_variable='rag_hubspot_union_schemas', 
+            union_database_variable='rag_hubspot_union_databases') 
+        }}
+    from base
+),
+
+final as (
+
+    select
+        id as engagement_id,
+        created_timestamp,
+        occurred_timestamp,
+        owner_id,
+        source_relation,
+        portal_id,
+        engagement_type,
+        is_active
+    from fields  
+)  
+
+select *
+from final
diff --git a/models/unified_rag.yml b/models/unified_rag.yml
@@ -5,7 +5,7 @@ models:
     description: Each record represents a chunk of text prepared for semantic-search and additional fields for use in LLM workflows.
     columns:
       - name: unique_id
-        description: Unique identifier of the table represented as a combination of document_id, platform, and source_relation fields.
+        description: Unique identifier of the table represented as a combination of document_id, platform, chunk_index, and source_relation fields.
         tests:
           - unique
           - not_null