Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
rimi-itk committed Nov 20, 2024
0 parents commit 9f9fb95
Show file tree
Hide file tree
Showing 14 changed files with 755 additions and 0 deletions.
45 changes: 45 additions & 0 deletions .github/workflows/pr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Review workflow: checks that the changelog has been updated and runs the
# coding standards checks for the ai-stuff branch.
on:
  push:
    branches:
      - 'ai-stuff'
  pull_request:
    branches:
      - 'ai-stuff'

name: Review

jobs:
  changelog:
    runs-on: ubuntu-latest
    name: Changelog should be updated
    # github.base_ref is only set for pull_request events. On push there is no
    # base branch to compare against, so the job is skipped instead of running
    # a comparison against the invalid ref "origin/".
    if: github.event_name == 'pull_request'
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 2

      - name: Git fetch
        run: git fetch

      - name: Check that changelog has been updated.
        env:
          # Handed over through the environment rather than interpolated
          # directly into the script text.
          BASE_REF: ${{ github.base_ref }}
        run: |
          # git diff --exit-code exits with 0 when there are no differences,
          # with 1 when there are differences and with another status on
          # errors (e.g. an unknown ref). Only status 1 means that the
          # changelog has been updated; errors must not pass silently.
          status=0
          git diff --exit-code "origin/${BASE_REF}" -- CHANGELOG.md || status=$?
          case "$status" in
            0)
              echo "::error file=CHANGELOG.md::CHANGELOG.md has not been updated"
              exit 1
              ;;
            1)
              exit 0
              ;;
            *)
              echo "::error::Could not compare CHANGELOG.md with origin/${BASE_REF}"
              exit "$status"
              ;;
          esac

  coding-standards-markdown:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Lints all Markdown files; the repository is mounted at /md in the container.
      - name: Coding standards
        run: |
          docker run --rm --volume "$PWD:/md" peterdavehello/markdownlint markdownlint '**/*.md'

  coding-standards-shellcheck:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # The */*.sh glob is expanded by the runner's shell before docker is
      # invoked; the repository is mounted at /mnt in the container.
      - name: Coding standards
        run: |
          docker run --rm --volume "$PWD:/mnt" koalaman/shellcheck:stable */*.sh
18 changes: 18 additions & 0 deletions .markdownlint.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
  // Start from the default rule set; the entries below adjust single rules.
  "default": true,

  // MD013: allow lines of up to 120 characters and do not apply the limit to
  // code blocks and tables.
  // https://github.com/DavidAnson/markdownlint/blob/main/doc/md013.md
  "line-length": { "line_length": 120, "code_blocks": false, "tables": false },

  // MD024: only report duplicate headings when they are siblings.
  // https://github.com/DavidAnson/markdownlint/blob/main/doc/md024.md
  "no-duplicate-heading": { "siblings_only": true },

  // MD033: the only inline HTML allowed is what collapsed sections need.
  // https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/organizing-information-with-collapsed-sections#creating-a-collapsed-section
  // https://github.com/DavidAnson/markdownlint/blob/main/doc/md033.md
  "no-inline-html": { "allowed_elements": ["details", "summary"] }
}
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added

- Data export script

[Unreleased]: https://github.com/itk-dev/os2loop/tree/ai-stuff
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# OS2Loop AI stuff

``` shell
git clone --branch ai-stuff https://github.com/itk-dev/os2loop os2loop-ai-stuff

# Run a script, e.g. (the export script requires a project directory and a site URI)
./os2loop-ai-stuff/data-export/export.sh project-dir site-uri
```

## Development

``` shell
task
```
33 changes: 33 additions & 0 deletions Taskfile.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
version: '3'

tasks:
  # Running "task" without arguments lists the available tasks.
  default:
    cmds:
      - task --list
    silent: true

  # Aggregate task: applies the automatic fixes first and then runs every
  # check, i.e. both the markdownlint check and the ShellCheck check.
  coding-standards:check:
    desc: "Apply coding standards and run checks"
    cmds:
      - task: coding-standards:apply
      - task: coding-standards:check:markdownlint
      - task: coding-standards:check:shellcheck

  coding-standards:apply:
    desc: "Apply coding standards"
    cmds:
      - task: coding-standards:apply:markdownlint

  # Same command as the markdownlint check below, but with --fix.
  coding-standards:apply:markdownlint:
    desc: "Run markdownlint-cli (https://github.com/igorshubovych/markdownlint-cli)"
    cmds:
      - docker run --rm --volume "$PWD:/md" peterdavehello/markdownlint markdownlint '**/*.md' --fix

  # Same command as the Markdown job in .github/workflows/pr.yml.
  coding-standards:check:markdownlint:
    desc: "Run markdownlint-cli (https://github.com/igorshubovych/markdownlint-cli)"
    cmds:
      - docker run --rm --volume "$PWD:/md" peterdavehello/markdownlint markdownlint '**/*.md'

  # Same command as the ShellCheck job in .github/workflows/pr.yml.
  coding-standards:check:shellcheck:
    desc: "Run ShellCheck (https://github.com/koalaman/shellcheck)"
    cmds:
      - docker run --rm --volume "$PWD:/mnt" koalaman/shellcheck:stable */*.sh
2 changes: 2 additions & 0 deletions data-export/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Generated export output: export.sh writes a .json and a .csv file next to
# each export_*.sql file in this directory.
*.csv
*.json
74 changes: 74 additions & 0 deletions data-export/export.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env bash
# Shell options, one per line:
#   errexit   exit when a command fails
#   errtrace  ERR traps are inherited by functions and subshells
#   noclobber ">" refuses to overwrite existing files (">|" still overwrites)
#   nounset   expanding an unset variable is an error
#   pipefail  a pipeline fails when any of its commands fails
set -o errexit
set -o errtrace
set -o noclobber
set -o nounset
set -o pipefail
# Split words on newlines and tabs only, not on spaces.
IFS=$'\n\t'

# Absolute path of the directory that contains this script.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Print an optional message followed by the usage line on stderr and exit
# with status 1.
#
# Arguments:
#   $1 - message to print before the usage line (optional)
usage() {
  local message="${1:-}"

  if [[ -n "$message" ]]; then
    printf '%s\n' "$message" >&2
  fi

  printf 'Usage: %s project-dir site-uri\n' "${BASH_SOURCE[0]}" >&2
  exit 1
}

# Both positional arguments (project directory and site URI) are required.
if [[ $# -lt 2 ]]; then
  usage "Too few arguments"
fi

project_dir="$1"
uri="$2"

# An empty project directory is a usage error; a missing directory gets its
# own error message.
[[ -n "$project_dir" ]] || usage "Invalid project directory"

if ! [[ -d "$project_dir" ]]; then
  echo "Project directory \"$project_dir\" does not exist" >&2
  exit 1
fi

[[ -n "$uri" ]] || usage "Invalid site-uri"

# The drush commands below use a path relative to the project directory.
cd "$project_dir"

# Run every export_*.sql query that sits next to this script and write the
# result as both JSON and CSV next to the query file. A failing export does
# not abort the run; a warning is printed on stderr and the loop continues.
filenames=("$script_dir"/export_*.sql)

for filename in "${filenames[@]}"; do
  # Without nullglob an unmatched pattern stays in the array as a literal
  # string; skip it rather than handing a non-existing file to drush.
  [ -e "$filename" ] || continue

  echo "$filename"

  # JSON

  # https://tldp.org/LDP/abs/html/string-manipulation.html
  output_filename=${filename/%.sql/.json}
  # https://github.com/drush-ops/drush/issues/3071#issuecomment-347929777
  # NOTE(review): the file name is interpolated into a single-quoted PHP
  # string, so this assumes that the path contains no single quote.
  vendor/bin/drush --uri="$uri" php:eval "return \Drupal::database()->query(file_get_contents('$filename'))->fetchAll()" --format=json >| "$output_filename" \
    || >&2 echo "Warning: JSON export failed for $filename"
  echo "$output_filename"

  # CSV

  output_filename=${filename/%.sql/.csv}
  # https://stackoverflow.com/a/22421445/2502647
  # Convert the tab separated output to CSV: every field that contains a comma
  # or a double quote is wrapped in double quotes and its double quotes are
  # doubled. Fields with a double quote but no comma must be quoted as well,
  # otherwise the row is not valid CSV.
  # NOTE(review): assumes that sql:cli emits unquoted, tab separated rows
  # (mysql batch mode) - confirm for other database clients.
  vendor/bin/drush --uri="$uri" sql:cli < "$filename" | awk 'BEGIN { FS="\t"; OFS="," } {
    rebuilt=0
    for(i=1; i<=NF; ++i) {
      if ($i ~ /[",]/) {
        gsub("\"", "\"\"", $i)
        $i = "\"" $i "\""
        rebuilt=1
      }
    }
    if (!rebuilt) { $1=$1 }
    print
  }' >| "$output_filename" \
    || >&2 echo "Warning: CSV export failed for $filename"
  echo "$output_filename"
done
63 changes: 63 additions & 0 deletions data-export/export_document_collections_with_meta.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
-- Export of document collections (nodes of type os2loop_documents_collection)
-- together with their metadata. Each node is left joined to its path alias,
-- the ids and aliases of the documents in the collection (aggregated as JSON arrays),
-- its content, info box, approval date, subject, tags, owner, review date and version.
-- NOTE(review) every join is a left join on the node id, so a node with several
-- matching rows in a joined table would be exported more than once - confirm
-- that each joined table holds at most one row per node.
Select
n_fd.nid,
n_fd.type,
n_fd.title,
pa.`alias` as relative_url,
-- created and changed are unix timestamps, formatted as YYYY-MM-DDTHH.MM.SS style date strings
DATE_FORMAT(FROM_UNIXTIME(n_fd.created), '%Y-%m-%dT%H:%i:%s') as created,
DATE_FORMAT(FROM_UNIXTIME(n_fd.changed), '%Y-%m-%dT%H:%i:%s') as `changed`,
dci.document_node_ids,
dci.document_relative_urls,
docscol_content.os2loop_documents_dc_content_value as content, -- all format are rich text (html and div encoded)
docs_ib.os2loop_documents_info_box_value as info_box,
approval_date.os2loop_shared_approval_date_value as approval_date,
`subject`.`name` as `subject`,
tags.tags,
`owner`.os2loop_shared_owner_value as `owner`,
rev_date.os2loop_shared_rev_date_value as review_date,
`version`.os2loop_shared_version_value as `version`
from (
-- base set, all nodes of type os2loop_documents_collection
SELECT nid,vid,type,uid,title,created,changed
FROM node_field_data
where type = 'os2loop_documents_collection'
-- The table os2loop_documents_collection_item associates document collections
-- (collection_id is the nid of the collection) with documents (document_id is the nid
-- of the document), except for 20 document collections. For example /rammedelegation
-- (nid 3807) is a collection of links to sharepoint docs, and /medicinhaandtering
-- (nid 3827) is a link to the collection
-- /instruks-korrekt-haandtering-af-medicin-i-sundhed-og-omsorg-mso (nid 4188).
-- Of 805 documents, 164 documents are not assigned to a document collection.
) as n_fd
-- alias of the collection node itself
left join path_alias as pa on CONCAT('/node/',n_fd.nid) = pa.path
-- per collection, the ids and aliases of its documents as JSON arrays
left join (
SELECT
doc_col_itm.collection_id,
json_arrayagg(doc_col_itm.document_id) as document_node_ids,
json_arrayagg(pa.`alias`) as document_relative_urls
from os2loop_documents_collection_item as doc_col_itm
left join path_alias as pa on CONCAT('/node/',doc_col_itm.document_id) = pa.path
group by doc_col_itm.collection_id
) as dci on n_fd.nid = dci.collection_id
left join node__os2loop_documents_dc_content as docscol_content on n_fd.nid = docscol_content.entity_id -- contains only records from bundle documents_collection (all delta 0, so top placement)
left join (
SELECT
entity_id,
os2loop_documents_info_box_value
FROM node__os2loop_documents_info_box
WHERE bundle = 'os2loop_documents_collection') as docs_ib on n_fd.nid = docs_ib.entity_id -- only from bundle document_collection
left join node__os2loop_shared_approval_date as approval_date on n_fd.nid = approval_date.entity_id
-- subject, resolved from the taxonomy term id to the term name
left join (
SELECT
n_ss.entity_id,
subject_tt_fd.name
FROM node__os2loop_shared_subject as n_ss
left join taxonomy_term_field_data as subject_tt_fd on n_ss.os2loop_shared_subject_target_id = subject_tt_fd.tid
where n_ss.bundle = 'os2loop_documents_collection') as `subject` on n_fd.nid = `subject`.entity_id
left join node__os2loop_shared_owner as `owner` on n_fd.nid = `owner`.entity_id
left join node__os2loop_shared_rev_date as rev_date on n_fd.nid = rev_date.entity_id
-- tags, resolved to term names and aggregated into one JSON array per node
left join (
SELECT
n_st.entity_id,
json_arrayagg(tt_fd.name) as tags
FROM node__os2loop_shared_tags as n_st
left join taxonomy_term_field_data as tt_fd on n_st.os2loop_shared_tags_target_id = tt_fd.tid
where n_st.bundle = 'os2loop_documents_collection'
group by n_st.entity_id) tags on n_fd.nid = tags.entity_id
left join node__os2loop_shared_version as `version` on n_fd.nid = `version`.entity_id
Loading

0 comments on commit 9f9fb95

Please sign in to comment.