-
Notifications
You must be signed in to change notification settings - Fork 820
147 lines (138 loc) · 6.02 KB
/
ingest-test-fixtures-update-pr.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
name: Ingest Test Fixtures Update PR
on:
workflow_dispatch:
env:
PYTHON_VERSION: "3.10"
permissions:
id-token: write
contents: read
jobs:
setup:
runs-on: ubuntu-latest-m
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
steps:
- uses: actions/checkout@v3
- uses: ./.github/actions/base-cache
with:
python-version: ${{ env.PYTHON_VERSION }}
setup_ingest:
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup]
steps:
- uses: actions/checkout@v3
- uses: ./.github/actions/base-ingest-cache
with:
python-version: ${{ env.PYTHON_VERSION }}
check-only: 'true'
update-fixtures-and-pr:
runs-on: ubuntu-latest-m
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup_ingest]
steps:
# actions/checkout MUST come before auth
- uses: 'actions/checkout@v4'
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v4
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Get full Python version
id: full-python-version
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT
- name: Setup virtual environment
uses: ./.github/actions/base-ingest-cache
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Setup docker-compose
uses: KengoTODA/actions-setup-docker-compose@v1
with:
version: '2.22.0'
- name: Update test fixtures
env:
AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }}
BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }}
CONFLUENCE_API_TOKEN: ${{ secrets.CONFLUENCE_API_TOKEN }}
CONFLUENCE_USER_EMAIL: ${{ secrets.CONFLUENCE_USER_EMAIL }}
DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
HUBSPOT_API_TOKEN: ${{ secrets.HUBSPOT_API_TOKEN }}
JIRA_INGEST_API_TOKEN: ${{ secrets.JIRA_INGEST_API_TOKEN }}
JIRA_INGEST_USER_EMAIL: ${{ secrets.JIRA_INGEST_USER_EMAIL }}
MONGODB_URI: ${{ secrets.MONGODB_URI }}
MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }}
MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
SALESFORCE_USERNAME: ${{secrets.SALESFORCE_USERNAME}}
SALESFORCE_CONSUMER_KEY: ${{secrets.SALESFORCE_CONSUMER_KEY}}
SALESFORCE_PRIVATE_KEY: ${{secrets.SALESFORCE_PRIVATE_KEY}}
SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}}
SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}}
SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}}
SHAREPOINT_PERMISSIONS_APP_ID: ${{secrets.SHAREPOINT_PERMISSIONS_APP_ID}}
SHAREPOINT_PERMISSIONS_APP_CRED: ${{secrets.SHAREPOINT_PERMISSIONS_APP_CRED}}
SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}}
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
OCTOAI_API_KEY: ${{ secrets.OCTOAI_API_KEY }}
TABLE_OCR: "tesseract"
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
OVERWRITE_FIXTURES: "true"
CI: "true"
run: |
source .venv/bin/activate
sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
make install-pandoc
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get install -y tesseract-ocr
sudo apt-get install -y tesseract-ocr-kor
tesseract --version
./test_unstructured_ingest/test-ingest-src.sh
- name: Save branch name to environment file
id: branch
run: |
original_branch=$(git rev-parse --abbrev-ref HEAD)
suffix="|ingest-test-fixtures-update-$(git rev-parse --short HEAD)"
branch_name="$original_branch$suffix"
echo "BRANCH_NAME=$branch_name" >> $GITHUB_ENV
- name: Save PR name to environment file
id: pr
run: |
commit_sha=$(git rev-parse HEAD)
prs=$(curl -s -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
"https://api.github.com/repos/${{ github.repository }}/commits/${commit_sha}/pulls")
pr_name=$(echo "$prs" | jq -r '.[0].title')
echo "PR_NAME=$pr_name" >> $GITHUB_ENV
- name: Create Pull Request
uses: peter-evans/create-pull-request@v5
with:
token: ${{ secrets.GH_CREATE_PR_TOKEN }}
add-paths: |
test_unstructured_ingest/expected-structured-output
test_unstructured_ingest/metrics
commit-message: "Update ingest test fixtures"
branch: ${{ env.BRANCH_NAME }}
title: "${{ env.PR_NAME }} <- Ingest test fixtures update"
assignees: ${{ github.actor }}
reviewers: ${{ github.actor }}
delete-branch: true
body: |
This pull request includes updated ingest test fixtures.
Please review and merge if appropriate.
base: ${{ github.head_ref }}