From 87843f52fca09ab84c72c6cebed4eeb750826d32 Mon Sep 17 00:00:00 2001 From: Mikkel Ricky Date: Wed, 20 Nov 2024 21:37:26 +0100 Subject: [PATCH] First commit --- .github/workflows/pr.yml | 45 +++++ .markdownlint.jsonc | 18 ++ CHANGELOG.md | 14 ++ README.md | 17 ++ Taskfile.yml | 33 ++++ data-export/.gitignore | 2 + data-export/export.sh | 74 +++++++++ .../export_document_collections_with_meta.sql | 63 +++++++ data-export/export_documents_with_meta.sql | 155 ++++++++++++++++++ data-export/export_external_with_meta.sql | 41 +++++ .../export_folding_parts_of_documents.sql | 54 ++++++ data-export/export_news_posts_with_meta.sql | 36 ++++ data-export/export_q_and_a_with_meta.sql | 139 ++++++++++++++++ data-export/export_static_pages_with_meta.sql | 67 ++++++++ 14 files changed, 758 insertions(+) create mode 100644 .github/workflows/pr.yml create mode 100644 .markdownlint.jsonc create mode 100644 CHANGELOG.md create mode 100644 README.md create mode 100644 Taskfile.yml create mode 100644 data-export/.gitignore create mode 100755 data-export/export.sh create mode 100644 data-export/export_document_collections_with_meta.sql create mode 100644 data-export/export_documents_with_meta.sql create mode 100644 data-export/export_external_with_meta.sql create mode 100644 data-export/export_folding_parts_of_documents.sql create mode 100644 data-export/export_news_posts_with_meta.sql create mode 100644 data-export/export_q_and_a_with_meta.sql create mode 100644 data-export/export_static_pages_with_meta.sql diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml new file mode 100644 index 000000000..8fb50fc27 --- /dev/null +++ b/.github/workflows/pr.yml @@ -0,0 +1,45 @@ +on: + push: + branches: + - 'ai-stuff' + pull_request: + branches: + - 'ai-stuff' + +name: Review + +jobs: + changelog: + runs-on: ubuntu-latest + name: Changelog should be updated + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 2 + + - name: Git fetch + run: git fetch + + - 
name: Check that changelog has been updated. + run: git diff --exit-code origin/${{ github.base_ref }} -- CHANGELOG.md && exit 1 || exit 0 + + coding-standards-markdown: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Coding standards + run: | + docker run --rm --volume "$PWD:/md" peterdavehello/markdownlint markdownlint '**/*.md' + + coding-standards-shellcheck: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Coding standards + run: | + docker run --rm --volume "$PWD:/mnt" koalaman/shellcheck:stable */*.sh diff --git a/.markdownlint.jsonc b/.markdownlint.jsonc new file mode 100644 index 000000000..86516e72b --- /dev/null +++ b/.markdownlint.jsonc @@ -0,0 +1,18 @@ +{ + "default": true, + // https://github.com/DavidAnson/markdownlint/blob/main/doc/md013.md + "line-length": { + "line_length": 120, + "code_blocks": false, + "tables": false + }, + // https://github.com/DavidAnson/markdownlint/blob/main/doc/md024.md + "no-duplicate-heading": { + "siblings_only": true + }, + // https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/organizing-information-with-collapsed-sections#creating-a-collapsed-section + // https://github.com/DavidAnson/markdownlint/blob/main/doc/md033.md + "no-inline-html": { + "allowed_elements": ["details", "summary"] + } +} diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000..957df01c4 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,14 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [Unreleased] + +### Added + +- Data export script + +[Unreleased]: https://github.com/itk-dev/os2loop/tree/ai-stuff diff --git a/README.md b/README.md new file mode 100644 index 000000000..92d14f3ce --- /dev/null +++ b/README.md @@ -0,0 +1,17 @@ +# OS2Loop AI stuff + +``` shell +git clone --branch ai-stuff https://github.com/itk-dev/os2loop os2loop-ai-stuff +``` + +## Scripts + +``` shell +./os2loop-ai-stuff/data-export/export.sh +``` + +## Development + +``` shell +task +``` diff --git a/Taskfile.yml b/Taskfile.yml new file mode 100644 index 000000000..86897133a --- /dev/null +++ b/Taskfile.yml @@ -0,0 +1,33 @@ +version: '3' + +tasks: + default: + cmds: + - task --list + silent: true + + coding-standards:check: + desc: "Apply coding standards and run checks" + cmds: + - task: coding-standards:apply + - task: coding-standards:check:shellcheck + + coding-standards:apply: + desc: "Apply coding standards" + cmds: + - task: coding-standards:apply:markdownlint + + coding-standards:apply:markdownlint: + desc: "Run markdownlint-cli (https://github.com/igorshubovych/markdownlint-cli)" + cmds: + - docker run --rm --volume "$PWD:/md" peterdavehello/markdownlint markdownlint '**/*.md' --fix + + coding-standards:check:markdownlint: + desc: "Run markdownlint-cli (https://github.com/igorshubovych/markdownlint-cli)" + cmds: + - docker run --rm --volume "$PWD:/md" peterdavehello/markdownlint markdownlint '**/*.md' + + coding-standards:check:shellcheck: + desc: "Run ShellCheck (https://github.com/koalaman/shellcheck)" + cmds: + - docker run --rm --volume "$PWD:/mnt" koalaman/shellcheck:stable */*.sh diff --git a/data-export/.gitignore b/data-export/.gitignore new file mode 100644 index 000000000..343a25cb8 --- /dev/null +++ b/data-export/.gitignore @@ -0,0 +1,2 @@ +*.csv +*.json diff --git a/data-export/export.sh b/data-export/export.sh new file mode 100755 index 000000000..d4532fd1c --- /dev/null +++ b/data-export/export.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +set -o 
errexit -o errtrace -o noclobber -o nounset -o pipefail +IFS=$'\n\t' + +script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) + +function usage() { + if [ -n "${1:-}" ]; then + >&2 cat <&2 cat <&2 echo 'Project directory "'"$project_dir"'" does not exist') + exit 1 +fi + +if [ -z "$uri" ]; then + usage "Invalid site-uri" +fi + +cd "$project_dir" + +filenames=("$script_dir"/export_*.sql) + +for filename in "${filenames[@]}"; do + echo "$filename" + + # JSON + + # https://tldp.org/LDP/abs/html/string-manipulation.html + output_filename=${filename/%.sql/.json} + # https://github.com/drush-ops/drush/issues/3071#issuecomment-347929777 + vendor/bin/drush --uri="$uri" php:eval "return \Drupal::database()->query(file_get_contents('$filename'))->fetchAll()" --format=json >| "$output_filename" || true + echo "$output_filename" + + # CSV + + output_filename=${filename/%.sql/.csv} + # https://stackoverflow.com/a/22421445/2502647 + vendor/bin/drush --uri="$uri" sql:cli < "$filename" | awk 'BEGIN { FS="\t"; OFS="," } { + rebuilt=0 + for(i=1; i<=NF; ++i) { + if ($i ~ /,/ && $i !~ /^".*"$/) { + gsub("\"", "\"\"", $i) + $i = "\"" $i "\"" + rebuilt=1 + } + } + if (!rebuilt) { $1=$1 } + print +}' >| "$output_filename" || true + echo "$output_filename" +done diff --git a/data-export/export_document_collections_with_meta.sql b/data-export/export_document_collections_with_meta.sql new file mode 100644 index 000000000..37fa64579 --- /dev/null +++ b/data-export/export_document_collections_with_meta.sql @@ -0,0 +1,63 @@ +Select + n_fd.nid, + n_fd.type, + n_fd.title, + pa.`alias` as relative_url, + DATE_FORMAT(FROM_UNIXTIME(n_fd.created), '%Y-%m-%dT%H:%i:%s') as created, + DATE_FORMAT(FROM_UNIXTIME(n_fd.changed), '%Y-%m-%dT%H:%i:%s') as `changed`, + dci.document_node_ids, + dci.document_relative_urls, + docscol_content.os2loop_documents_dc_content_value as content, -- all format are rich text (html and div encoded) + docs_ib.os2loop_documents_info_box_value as info_box, + 
approval_date.os2loop_shared_approval_date_value as approval_date, + `subject`.`name` as `subject`, + tags.tags, + `owner`.os2loop_shared_owner_value as `owner`, + rev_date.os2loop_shared_rev_date_value as review_date, + `version`.os2loop_shared_version_value as `version` +from ( + SELECT nid,vid,type,uid,title,created,changed + FROM node_field_data + where type = 'os2loop_documents_collection' + -- the table os2loop_documents_collection_item associate document collections (their nid on collection_id) + -- to documents (document_id = nid) except for 20 document collections. Fx case /rammedelegation nid=3807 it is a collection of + -- links to sharepoint docs and /medicinhaandtering nid=3827 is a link to the collection + -- /instruks-korrekt-haandtering-af-medicin-i-sundhed-og-omsorg-mso nid 4188 + -- of 805 documents 164 documents are not assigned to a document_collection + ) as n_fd +left join path_alias as pa on CONCAT('/node/',n_fd.nid) = pa.path +left join ( + SELECT + doc_col_itm.collection_id, + json_arrayagg(doc_col_itm.document_id) as document_node_ids, + json_arrayagg(pa.`alias`) as document_relative_urls + from os2loop_documents_collection_item as doc_col_itm + left join path_alias as pa on CONCAT('/node/',doc_col_itm.document_id) = pa.path + group by doc_col_itm.collection_id + ) as dci on n_fd.nid = dci.collection_id +left join node__os2loop_documents_dc_content as docscol_content on n_fd.nid = docscol_content.entity_id -- contains only records from bundle documents_collection (all delta 0, so top placement) +left join ( + SELECT + entity_id, + os2loop_documents_info_box_value + FROM node__os2loop_documents_info_box + WHERE bundle = 'os2loop_documents_collection') as docs_ib on n_fd.nid = docs_ib.entity_id -- only from bundle document_collection +left join node__os2loop_shared_approval_date as approval_date on n_fd.nid = approval_date.entity_id +left join ( + SELECT + n_ss.entity_id, + subject_tt_fd.name + FROM node__os2loop_shared_subject as n_ss + 
left join taxonomy_term_field_data as subject_tt_fd on n_ss.os2loop_shared_subject_target_id = subject_tt_fd.tid + where n_ss.bundle = 'os2loop_documents_collection') as `subject` on n_fd.nid = `subject`.entity_id +left join node__os2loop_shared_owner as `owner` on n_fd.nid = `owner`.entity_id +left join node__os2loop_shared_rev_date as rev_date on n_fd.nid = rev_date.entity_id +left join ( + SELECT + n_st.entity_id, + json_arrayagg(tt_fd.name) as tags + FROM node__os2loop_shared_tags as n_st + left join taxonomy_term_field_data as tt_fd on n_st.os2loop_shared_tags_target_id = tt_fd.tid + where n_st.bundle = 'os2loop_documents_collection' + group by n_st.entity_id) tags on n_fd.nid = tags.entity_id +left join node__os2loop_shared_version as `version` on n_fd.nid = `version`.entity_id diff --git a/data-export/export_documents_with_meta.sql b/data-export/export_documents_with_meta.sql new file mode 100644 index 000000000..16811ee65 --- /dev/null +++ b/data-export/export_documents_with_meta.sql @@ -0,0 +1,155 @@ +Select + n_fd.nid, + -- n_fd.type, + n_fd.title, + pa.`alias` as relative_url, + DATE_FORMAT(FROM_UNIXTIME(n_fd.created), '%Y-%m-%dT%H:%i:%s') as created, + DATE_FORMAT(FROM_UNIXTIME(n_fd.changed), '%Y-%m-%dT%H:%i:%s') as `changed`, + doc_author.os2loop_documents_document_autho_value as document_author, -- is this okay to share, here we name person, not just orgs + doc_body.os2loop_documents_document_body_value as document_body, -- all delta is zero + doc_content.content, -- when 'content_type' is os2loop_documents_step_by_step the JSON object should be replaced by the corresponding folding_part entity depending on the content_target_id, otherwise the field can be removed + `subject`.`name` as `subject`, + tags.tags, + approval_date.os2loop_shared_approval_date_value as approval_date, + rev_date.os2loop_shared_rev_date_value as review_date, + `version`.os2loop_shared_version_value as `version` +from ( + SELECT nid,vid,type,uid,title,created,changed + FROM 
node_field_data + where `type` = 'os2loop_documents_document' + ) as n_fd +left join path_alias as pa on CONCAT('/node/',n_fd.nid) = pa.path +left join node__os2loop_documents_document_autho as doc_author on n_fd.nid = doc_author.entity_id +left join node__os2loop_documents_document_body as doc_body on n_fd.nid = doc_body.entity_id +-- group by n_fd.nid -- returns 701 records +left join ( + SELECT + n_ddc.entity_id, + -- ANY_VALUE(para_ifd.parent_field_name) as parent_field_name, + json_arrayagg(json_object( + 'delta', n_ddc.delta, + 'paragraph_title', doc_title.os2loop_documents_title_value, -- all delta is zero + 'paragraph_description', IFNull(doc_desc.os2loop_documents_description_value, IFNULL(tbl.os2loop_documents_tbl_desc_value, video.os2loop_video_description_value)), + 'paragraph', IFNull(doc_hc.os2loop_documents_hc_content_value, IFNull(text_and_image.os2loop_documents_tai_text_value, tbl.os2loop_documents_tbl_cont_value)), + 'content', json_object('content_target_id', n_ddc.os2loop_documents_document_conte_target_id, + 'content_type', para_ifd.`type` + ), -- when 'content_type' is os2loop_documents_step_by_step the JSON object should be replaced by the corresponding folding_part entity depending on the content_target_id, otherwise the field can be removed + 'text_and_image_position', text_and_image.os2loop_documents_tai_position_value, + 'media', json_object('uri', IFNull(text_and_image.uri, video.os2loop_video_url_uri), + 'filemime', text_and_image.filemime, + 'filesize', text_and_image.filesize, + 'created', DATE_FORMAT(FROM_UNIXTIME(text_and_image.created), '%Y-%m-%dT%H:%i:%s'), + 'changed', DATE_FORMAT(FROM_UNIXTIME(text_and_image.`changed`), '%Y-%m-%dT%H:%i:%s'), + 'video_title', video.os2loop_video_title_value, + 'video_iframe', video.os2loop_video_iframe_value + ) + )) as content + FROM node__os2loop_documents_document_conte as n_ddc + left join paragraphs_item_field_data as para_ifd on n_ddc.os2loop_documents_document_conte_target_id = 
para_ifd.id + left join paragraph__os2loop_documents_hc_content as doc_hc on n_ddc.os2loop_documents_document_conte_target_id = doc_hc.entity_id + left join ( + WITH media_attachments AS (SELECT + media_collected.mid, + fm.uri, -- where public:// -> https://loop.sundhedogomsorg.dk/sites/loop.sundhedogomsorg.dk/files/ + fm.filemime, + fm.filesize, + fm.created, + fm.changed + FROM (SELECT + m_fd.mid, + IFNULL (m_mf.field_media_file_target_id, m_mi.field_media_image_target_id) as target_id + from media_field_data as m_fd + left join media__field_media_file as m_mf on m_fd.mid = m_mf.entity_id -- remember dep on m_fd.bundle whether it should be media__field_media_file, media__field_media_image or media__field_media_library we need to join on + left join media__field_media_image as m_mi on m_fd.mid = m_mi.entity_id + -- left join media__field_media_library as m_ml on m_fd.mid = m_ml.entity_id -- turns out the media_library is not the correct reference for the file_managed table + ) as media_collected + left join file_managed as fm on media_collected.target_id = fm.fid) + SELECT + tai_pos.bundle, + tai_pos.deleted, + tai_pos.entity_id, + tai_pos.revision_id, + tai_pos.langcode, + tai_pos.delta, + tai_pos.os2loop_documents_tai_position_value, + tai_text.os2loop_documents_tai_text_value, + tai_text.os2loop_documents_tai_text_format, + tai_image.os2loop_documents_tai_image_target_id, + media_attachments.uri, + media_attachments.filemime, + media_attachments.filesize, + media_attachments.created, + media_attachments.`changed` + FROM paragraph__os2loop_documents_tai_position as tai_pos + left join paragraph__os2loop_documents_tai_text as tai_text on tai_pos.entity_id = tai_text.entity_id + -- where tai_pos.bundle != tai_text.bundle -- none + -- where tai_pos.entity_id != tai_text.entity_id -- none + -- where tai_pos.delta != tai_text.delta -- none + left join paragraph__os2loop_documents_tai_image as tai_image on tai_pos.entity_id = tai_image.entity_id + -- where 
tai_pos.bundle != tai_image.bundle -- none + -- where tai_pos.entity_id != tai_image.entity_id -- none + -- where tai_pos.delta != tai_image.delta -- none + -- where !isnull(tai_text.os2loop_documents_tai_text_format) -- 113 records - like number of records in tai_text + -- where !isnull(tai_image.os2loop_documents_tai_image_target_id) -- 60 records - like number of records in tai_image + left join media_attachments on tai_image.os2loop_documents_tai_image_target_id = media_attachments.mid + ) as text_and_image on n_ddc.os2loop_documents_document_conte_target_id = text_and_image.entity_id + left join ( + SELECT + video_src.bundle, + video_src.entity_id, + video_src.delta, + video_title.os2loop_video_title_value, + video_desc.os2loop_video_description_value, + video_ifrm.os2loop_video_iframe_value, + video_url.os2loop_video_url_uri, + -- video_url.os2loop_video_url_title -- all empty so far + -- Convert(video_url.os2loop_video_url_options, CHAR(255)), -- nothing interesting so far (all contains "a:0:{}") + video_width.os2loop_video_width_value + FROM paragraph__os2loop_video_source_type as video_src -- so far all sources are url + left join paragraph__os2loop_video_description as video_desc on video_src.entity_id = video_desc.entity_id + left join paragraph__os2loop_video_iframe as video_ifrm on video_src.entity_id = video_ifrm.entity_id + left join paragraph__os2loop_video_title as video_title on video_src.entity_id = video_title.entity_id + left join paragraph__os2loop_video_url as video_url on video_src.entity_id = video_url.entity_id + left join paragraph__os2loop_video_width as video_width on video_src.entity_id = video_width.entity_id + ) as video on n_ddc.os2loop_documents_document_conte_target_id = video.entity_id + -- where para_ifd.type = 'os2loop_video' -- returns 10 of 14 video records + left join ( + SELECT + tbl_content.bundle, + tbl_content.entity_id, + tbl_content.delta, + tbl_content.os2loop_documents_tbl_cont_value, +
tbl_content.os2loop_documents_tbl_cont_format, + tbl_desc.os2loop_documents_tbl_desc_value + FROM paragraph__os2loop_documents_tbl_cont as tbl_content + left join paragraph__os2loop_documents_tbl_desc as tbl_desc on tbl_content.entity_id = tbl_desc.entity_id + ) as tbl on n_ddc.os2loop_documents_document_conte_target_id = tbl.entity_id + -- where para_ifd.`type` = 'os2loop_documents_table' -- returns 3 of 16 records + left join `paragraph__os2loop_documents_title` as doc_title on n_ddc.os2loop_documents_document_conte_target_id = doc_title.entity_id + left join paragraph__os2loop_documents_description as doc_desc on n_ddc.os2loop_documents_document_conte_target_id = doc_desc.entity_id -- all para doc desc relates to bundle step_by_step + -- we miss content on step by step paragraphs, (we only have description so far) this must be added post querying using the dedicated query export_folding_parts_of_documents after it have been post-processed by the preprocess_loop_document_folding_parts.py + -- we ignore table of contents as this does not add any additional information conveying text + where para_ifd.`type` not in ( + 'table_of_contents' -- this leaves out 3 records + ) + group by n_ddc.entity_id + ) as doc_content on n_fd.nid = doc_content.entity_id +-- where ((not ISNULL(doc_body.os2loop_documents_document_body_value)) and not ISNULL(n_ddc.os2loop_documents_document_conte_target_id)) -- returns 13 records, where 3 (maybe 4) records seems to have both relevant body and content +left join node__os2loop_shared_approval_date as approval_date on n_fd.nid = approval_date.entity_id +left join ( + SELECT + n_ss.entity_id, + subject_tt_fd.name + FROM node__os2loop_shared_subject as n_ss + left join taxonomy_term_field_data as subject_tt_fd on n_ss.os2loop_shared_subject_target_id = subject_tt_fd.tid + where n_ss.bundle = 'os2loop_documents_document') as `subject` on n_fd.nid = `subject`.entity_id +left join ( + SELECT + n_st.entity_id, + json_arrayagg(tt_fd.name) as tags 
+ FROM node__os2loop_shared_tags as n_st + left join taxonomy_term_field_data as tt_fd on n_st.os2loop_shared_tags_target_id = tt_fd.tid + where n_st.bundle = 'os2loop_documents_document' + group by n_st.entity_id) tags on n_fd.nid = tags.entity_id +left join node__os2loop_shared_rev_date as rev_date on n_fd.nid = rev_date.entity_id +left join node__os2loop_shared_version as `version` on n_fd.nid = `version`.entity_id diff --git a/data-export/export_external_with_meta.sql b/data-export/export_external_with_meta.sql new file mode 100644 index 000000000..fdfff75af --- /dev/null +++ b/data-export/export_external_with_meta.sql @@ -0,0 +1,41 @@ +Select + n_fd.nid, + n_fd.type, + n_fd.title, + pa.`alias` as relative_url, + DATE_FORMAT(FROM_UNIXTIME(n_fd.created), '%Y-%m-%dT%H:%i:%s') as created, + DATE_FORMAT(FROM_UNIXTIME(n_fd.changed), '%Y-%m-%dT%H:%i:%s') as `changed`, + -- doc_author.os2loop_documents_document_autho_value, -- not relevant for external + -- doc_body.os2loop_documents_document_body_value -- not relevant for external + ext_desc.os2loop_external_descripti_value as `description`, + ext_url.os2loop_external_url_title as url_title_text, + ext_url.os2loop_external_url_uri as url, + `subject`.`name` as `subject`, + u_jt.os2loop_user_job_title_value as job_title, + tags.tags, + rev_date.os2loop_shared_rev_date_value as review_date +from ( + SELECT nid,vid,type,uid,title,created,changed + FROM node_field_data + where type = 'os2loop_external' -- nodes with links/external references (it seems) + ) as n_fd +left join path_alias as pa on CONCAT('/node/',n_fd.nid) = pa.path +left join user__os2loop_user_job_title as u_jt on n_fd.uid = u_jt.entity_id +left join node__os2loop_external_descripti as ext_desc on n_fd.nid = ext_desc.entity_id +left join node__os2loop_external_url as ext_url on n_fd.nid = ext_url.entity_id +left join ( + SELECT + n_ss.entity_id, + subject_tt_fd.name + FROM node__os2loop_shared_subject as n_ss + left join taxonomy_term_field_data as 
subject_tt_fd on n_ss.os2loop_shared_subject_target_id = subject_tt_fd.tid + where n_ss.bundle = 'os2loop_external') as `subject` on n_fd.nid = `subject`.entity_id +left join ( + SELECT + n_st.entity_id, + json_arrayagg(tt_fd.name) as tags + FROM node__os2loop_shared_tags as n_st + left join taxonomy_term_field_data as tt_fd on n_st.os2loop_shared_tags_target_id = tt_fd.tid + where n_st.bundle = 'os2loop_external' + group by n_st.entity_id) tags on n_fd.nid = tags.entity_id +left join node__os2loop_shared_rev_date as rev_date on n_fd.nid = rev_date.entity_id diff --git a/data-export/export_folding_parts_of_documents.sql b/data-export/export_folding_parts_of_documents.sql new file mode 100644 index 000000000..d83ddd2a6 --- /dev/null +++ b/data-export/export_folding_parts_of_documents.sql @@ -0,0 +1,54 @@ +SELECT + steps.bundle as step_bundle, + steps.entity_id, + steps.delta, + steps.os2loop_documents_steps_target_id as step_target_id, + step_title.os2loop_documents_step_title_value as step_title, + -- step_text.bundle as text_bundle, + step_text.delta as text_delta, + step_text.os2loop_documents_step_text_value, + -- step_text.os2loop_documents_step_text_format, + -- step_image.bundle as image_bundle, + step_image.delta as image_delta, + step_image.uri, + step_image.filemime, + step_image.filesize, + DATE_FORMAT(FROM_UNIXTIME(step_image.created), '%Y-%m-%dT%H:%i:%s') as created, + DATE_FORMAT(FROM_UNIXTIME(step_image.`changed`), '%Y-%m-%dT%H:%i:%s') as `changed` +from paragraph__os2loop_documents_steps as steps +left join paragraph__os2loop_documents_step_title as step_title on steps.os2loop_documents_steps_target_id = step_title.entity_id -- contains 682 records, but only 617 records are joined, so there might be some problem +left join paragraph__os2loop_documents_step_text as step_text on steps.os2loop_documents_steps_target_id = step_text.entity_id -- bundle is only os2loop_documents_step, contains 620 records, but only 558 are joined +left join ( + WITH
media_attachments AS (SELECT + media_collected.mid, + fm.uri, -- where public:// -> https://loop.sundhedogomsorg.dk/sites/loop.sundhedogomsorg.dk/files/ + fm.filemime, + fm.filesize, + fm.created, + fm.changed + FROM (SELECT + m_fd.mid, + -- IFNULL (m_mf.field_media_file_target_id, IFNULL(m_mi.field_media_image_target_id, m_ml.field_media_library_target_id)) as target_id + IFNULL (m_mf.field_media_file_target_id, m_mi.field_media_image_target_id) as target_id + from media_field_data as m_fd + left join media__field_media_file as m_mf on m_fd.mid = m_mf.entity_id -- remember dep on m_fd.bundle whether it should be media__field_media_file, media__field_media_image or media__field_media_library we need to join on + left join media__field_media_image as m_mi on m_fd.mid = m_mi.entity_id + -- left join media__field_media_library as m_ml on m_fd.mid = m_ml.entity_id + ) as media_collected + left join file_managed as fm on media_collected.target_id = fm.fid) + SELECT + step_image.bundle, + -- step_image.deleted, + step_image.entity_id, + -- step_image.revision_id, + -- step_image.langcode, + step_image.delta, + -- step_image.os2loop_documents_step_image_target_id, + media_attachments.uri, + media_attachments.filemime, + media_attachments.filesize, + media_attachments.created, + media_attachments.`changed` + FROM paragraph__os2loop_documents_step_image as step_image + left join media_attachments on step_image.os2loop_documents_step_image_target_id = media_attachments.mid + ) as step_image on steps.os2loop_documents_steps_target_id = step_image.entity_id diff --git a/data-export/export_news_posts_with_meta.sql b/data-export/export_news_posts_with_meta.sql new file mode 100644 index 000000000..8b84c43b1 --- /dev/null +++ b/data-export/export_news_posts_with_meta.sql @@ -0,0 +1,36 @@ +Select + n_fd.nid, + n_fd.type, + n_fd.title, + pa.`alias` as relative_url, + DATE_FORMAT(FROM_UNIXTIME(n_fd.created), '%Y-%m-%dT%H:%i:%s') as created, + DATE_FORMAT(FROM_UNIXTIME(n_fd.changed), 
'%Y-%m-%dT%H:%i:%s') as `changed`, + -- doc_author.os2loop_documents_document_autho_value, -- is this okay to share, here we name person, not just orgs + -- doc_body.os2loop_documents_document_body_value, -- all delta is zero + post_cont.os2loop_post_content_value as `content`, + `subject`.`name` as `subject`, + tags.tags +from ( + SELECT nid,vid,type,uid,title,created,changed + FROM node_field_data + where type = 'os2loop_post' -- nyheder/indlæg + ) as n_fd +left join path_alias as pa on CONCAT('/node/',n_fd.nid) = pa.path +-- left join node__os2loop_documents_document_autho as doc_author on n_fd.nid = doc_author.entity_id +-- left join node__os2loop_documents_document_body as doc_body on n_fd.nid = doc_body.entity_id +left join node__os2loop_post_content as post_cont on n_fd.nid = post_cont.entity_id +left join ( + SELECT + n_ss.entity_id, + subject_tt_fd.name + FROM node__os2loop_shared_subject as n_ss + left join taxonomy_term_field_data as subject_tt_fd on n_ss.os2loop_shared_subject_target_id = subject_tt_fd.tid + where n_ss.bundle = 'os2loop_post') as `subject` on n_fd.nid = `subject`.entity_id +left join ( + SELECT + n_st.entity_id, + json_arrayagg(tt_fd.name) as tags + FROM node__os2loop_shared_tags as n_st + left join taxonomy_term_field_data as tt_fd on n_st.os2loop_shared_tags_target_id = tt_fd.tid + where n_st.bundle = 'os2loop_post' + group by n_st.entity_id) tags on n_fd.nid = tags.entity_id; diff --git a/data-export/export_q_and_a_with_meta.sql b/data-export/export_q_and_a_with_meta.sql new file mode 100644 index 000000000..1a4168b31 --- /dev/null +++ b/data-export/export_q_and_a_with_meta.sql @@ -0,0 +1,139 @@ +WITH media_attachments AS (SELECT + media_collected.mid, + fm.uri, -- where public:// -> https://loop.sundhedogomsorg.dk/sites/loop.sundhedogomsorg.dk/files/ + fm.filemime, + fm.filesize, + fm.created, + fm.changed + FROM (SELECT + m_fd.mid, + -- m_fd.vid, + -- -- m_fd.bundle, + -- -- m_fd.langcode, + -- m_fd.status, + -- m_fd.uid, + -- 
m_fd.name, + -- m_fd.thumbnail__target_id, + -- m_fd.thumbnail__alt, + -- m_fd.thumbnail__title, + -- m_fd.thumbnail__width, + -- m_fd.thumbnail__height, + -- m_fd.created, + -- m_fd.changed, + -- m_fd.default_langcode, + -- m_fd.revision_translation_affected, + -- IFNULL (m_mf.bundle, IFNULL(m_mi.bundle, m_ml.bundle)) as bundle, -- where m_fd.bundle != IFNULL (m_mf.bundle, IFNULL(m_mi.bundle, m_ml.bundle)) returns zero matches + -- IFNULL (m_mf.deleted, IFNULL(m_mi.deleted, m_ml.deleted)) as deleted, + -- IFNULL (m_mf.entity_id, IFNULL(m_mi.entity_id, m_ml.entity_id)) as entity_id, + -- IFNULL (m_mf.revision_id, IFNULL(m_mi.revision_id, m_ml.revision_id)) as revision_id, + -- IFNULL (m_mf.langcode, IFNULL(m_mi.langcode, m_ml.langcode)) as langcode, -- where m_fd.langcode != IFNULL (m_mf.langcode, IFNULL(m_mi.langcode, m_ml.langcode)) returns zero matches + -- IFNULL (m_mf.delta, IFNULL(m_mi.delta, m_ml.delta)) as delta, + IFNULL (m_mf.field_media_file_target_id, IFNULL(m_mi.field_media_image_target_id, m_ml.field_media_library_target_id)) as target_id + -- m_mf.field_media_file_display, + -- m_mf.field_media_file_description, + -- m_mi.field_media_image_alt, + -- m_mi.field_media_image_title, + -- m_mi.field_media_image_width, + -- m_mi.field_media_image_height + from media_field_data as m_fd + left join media__field_media_file as m_mf on m_fd.mid = m_mf.entity_id -- remember dep on m_fd.bundle whether it should be media__field_media_file, media__field_media_image or media__field_media_library we need to join on + left join media__field_media_image as m_mi on m_fd.mid = m_mi.entity_id + left join media__field_media_library as m_ml on m_fd.mid = m_ml.entity_id) as media_collected + left join file_managed as fm on media_collected.target_id = fm.fid), +question_attachments AS (SELECT + n_qf.entity_id, + media_attachments.uri, -- where public:// -> https://loop.sundhedogomsorg.dk/sites/loop.sundhedogomsorg.dk/files/ + media_attachments.filemime, +
media_attachments.filesize, + media_attachments.created, + media_attachments.changed + FROM node__os2loop_question_file as n_qf + left join media_attachments on n_qf.os2loop_question_file_target_id = media_attachments.mid), +answer_attachment AS (SELECT + c_am.entity_id, + media_attachments.uri, -- where public:// -> https://loop.sundhedogomsorg.dk/sites/loop.sundhedogomsorg.dk/files/ + media_attachments.filemime, + media_attachments.filesize, + media_attachments.created, + media_attachments.changed + FROM comment__os2loop_question_answer_media as c_am + left join media_attachments on c_am.os2loop_question_answer_media_target_id = media_attachments.mid) -- the only entry here is null, maybe it has been deleted??, but in future there might be something + +Select -- *, + n_fd.title as title, + pa.alias as relative_public_url, + n_qc.os2loop_question_content_value as question, + DATE_FORMAT(FROM_UNIXTIME(n_fd.created), '%Y-%m-%dT%H:%i:%s') as question_created, + DATE_FORMAT(FROM_UNIXTIME(n_fd.changed), '%Y-%m-%dT%H:%i:%s') as question_changed_in_some_way, + c_qa.os2loop_question_answer_value as response, + DATE_FORMAT(FROM_UNIXTIME(c_fd.created), '%Y-%m-%dT%H:%i:%s') as response_created, + DATE_FORMAT(FROM_UNIXTIME(c_fd.changed), '%Y-%m-%dT%H:%i:%s') as response_changed_in_some_way, + IF(c_qa.os2loop_question_answer_value LIKE '%loop.sundhedogomsorg%', 1, 0) as internal_reference, + IF(c_qa.os2loop_question_answer_value LIKE '%http%' or c_qa.os2loop_question_answer_value LIKE '%href%' or c_qa.os2loop_question_answer_value LIKE '%www.%', 1, 0) as any_reference, + u_jt.os2loop_user_job_title_value as questioner_job_title, + subject_tt_fd.name as `subject`, + agg_professions.target_professions as target_professions, + agg_tags.tags as question_tags, + u_jt_c.os2loop_user_job_title_value as respondent_job_title, + IFNULL(flags.redaktionens_anbefaling,0) as editor_choice, -- add if null -> 0 + IFNULL(flags.likes,0) as likes, -- if null ->0 + -- (, editor_edited) + -- # 
images/media: + -- table comment__os2loop_question_answer_media only contains one line and the media here I cannot dereference + -- it seems that it should be referenced in the bundle os2loop_media_file (according to table media) corresponding to table media__field_media_file, but here the reference is not among the entity_ids + -- The reference could also not be found in any of the other media__field tables nor in file_managed + question_attachments.uri as question_attachment_uri, -- where public:// -> https://loop.sundhedogomsorg.dk/sites/loop.sundhedogomsorg.dk/files/ + question_attachments.filemime as question_attachment_filemime, + question_attachments.filesize as question_attachment_filesize, + DATE_FORMAT(FROM_UNIXTIME(question_attachments.created), '%Y-%m-%dT%H:%i:%s') as question_attachment_created, + DATE_FORMAT(FROM_UNIXTIME(question_attachments.changed), '%Y-%m-%dT%H:%i:%s') as question_attachment_changed + /* Not included as of export 2/11 - 2023 there are no attachments in answers + answer_attachment.uri as answer_attachment_uri, -- where public:// -> https://loop.sundhedogomsorg.dk/sites/loop.sundhedogomsorg.dk/files/ + answer_attachment.filemime as answer_attachment_filemime, + answer_attachment.filesize as answer_attachment_filesize, + answer_attachment.created as answer_attachment_created, + answer_attachment.changed as answer_attachment_changed + */ +from ( + SELECT nid,vid,type,uid,title,created,changed + FROM node_field_data + where type='os2loop_question' + ) as n_fd +left join path_alias as pa on CONCAT('/node/',n_fd.nid) = pa.path +left join node__os2loop_question_content as n_qc on n_fd.nid = n_qc.entity_id +left join user__os2loop_user_job_title as u_jt on n_fd.uid = u_jt.entity_id +left join node__os2loop_shared_subject as n_ss on n_fd.nid = n_ss.entity_id +left join taxonomy_term_field_data as subject_tt_fd on n_ss.os2loop_shared_subject_target_id = subject_tt_fd.tid +left join (SELECT + n_sp.entity_id, +
CONVERT(JSON_ARRAYAGG(target_prof_tt_fd.name) USING latin1) as target_professions + FROM node__os2loop_shared_profession as n_sp + LEFT JOIN taxonomy_term_field_data as target_prof_tt_fd on n_sp.os2loop_shared_profession_target_id = target_prof_tt_fd.tid + WHERE n_sp.bundle = 'os2loop_question' + group by n_sp.entity_id) AS agg_professions on n_fd.nid = agg_professions.entity_id +left join (SELECT + n_st.entity_id, + CONVERT(JSON_ARRAYAGG(tag_tt_fd.name) USING latin1) as tags + FROM node__os2loop_shared_tags as n_st + left join taxonomy_term_field_data as tag_tt_fd on n_st.os2loop_shared_tags_target_id = tag_tt_fd.tid + where n_st.bundle = 'os2loop_question' + group by n_st.entity_id) as agg_tags on n_fd.nid = agg_tags.entity_id +left join question_attachments on n_fd.nid = question_attachments.entity_id +left join comment_field_data as c_fd on n_fd.nid = c_fd.entity_id +left join comment__os2loop_question_answer as c_qa on c_fd.cid = c_qa.entity_id +left join user__os2loop_user_job_title as u_jt_c on c_fd.uid = u_jt_c.entity_id +left join (SELECT + entity_id, + MAX( IF (flag_id = 'os2loop_upvote_correct_answer', True, False)) as redaktionens_anbefaling, + MAX( IF (flag_id = 'os2loop_upvote_upvote_button', count, 0)) as likes + FROM flag_counts + where entity_type = 'comment' + group by entity_id + -- maybe and flag_id = 'os2loop_upvote_correct_answer' + ) as flags on flags.entity_id = c_fd.cid +left join answer_attachment on c_fd.cid = answer_attachment.entity_id +-- where +-- and (c_qa.os2loop_question_answer_value LIKE '%http%' or c_qa.os2loop_question_answer_value LIKE '%href%') +-- where c_qa.os2loop_question_answer_value LIKE '%loop.sundhedogomsorg%' + -- c_fd.cid = '5183' + -- and n_fd.title Like 'Ved kommunikation %'; diff --git a/data-export/export_static_pages_with_meta.sql b/data-export/export_static_pages_with_meta.sql new file mode 100644 index 000000000..46b88235c --- /dev/null +++ b/data-export/export_static_pages_with_meta.sql @@ -0,0 +1,67 @@ 
+-- Export static pages ('os2loop_page') and section pages ('os2loop_section_page')
+-- with their URL alias, ISO 8601-style timestamps (no time zone) and text content.
+--
+-- Each node yields one row, or one row per referenced paragraph for a section page
+-- (more if any joined table matches several rows); a paragraph fills only the
+-- free-html / info-text / view columns it has. Rows are ordered by node id and
+-- paragraph delta so repeated exports are stable.
+--
+-- Deliberately not exported (notes carried over from earlier revisions of this query):
+--   * os2loop_page_content_summary: held no information when this was written.
+--   * paragraph__os2loop_section_page_title: the table was empty.
+--   * paragraphs_item_field_data: nothing from it is needed here.
+--   * document author/body fields: the author field names persons, not just organisations.
+SELECT
+    n_fd.nid,
+    n_fd.type,
+    n_fd.title,
+    pa.`alias` AS relative_url,
+    -- created/changed are passed to FROM_UNIXTIME, i.e. treated as unix timestamps.
+    DATE_FORMAT(FROM_UNIXTIME(n_fd.created), '%Y-%m-%dT%H:%i:%s') AS created,
+    DATE_FORMAT(FROM_UNIXTIME(n_fd.changed), '%Y-%m-%dT%H:%i:%s') AS `changed`,
+    page_cont.os2loop_page_content_value AS page_content,
+    sec_page_para.os2loop_section_page_free_html_value,
+    sec_page_para.os2loop_section_page_info_text_value,
+    sec_page_para.os2loop_section_page_view_header_value,
+    sec_page_para.os2loop_section_page_view_text_value
+FROM (
+    SELECT nid, type, title, created, changed
+    FROM node_field_data
+    WHERE type IN (
+        'os2loop_page',  -- static pages
+        -- only 3 nodes: /velkommen-til-loop (landing page),
+        -- /arbejdsgange-og-vejledninger and /sundhedsfaglige-instrukser - collection pages
+        'os2loop_section_page'
+    )
+) AS n_fd
+-- Aliases are looked up by the node's internal '/node/<nid>' path.
+LEFT JOIN path_alias AS pa
+    ON pa.path = CONCAT('/node/', n_fd.nid)
+LEFT JOIN node__os2loop_page_content AS page_cont
+    ON page_cont.entity_id = n_fd.nid
+LEFT JOIN (
+    -- One row per (section page, referenced paragraph) with that paragraph's text fields.
+    SELECT
+        para_ref.entity_id,
+        para_ref.delta,
+        free_html.os2loop_section_page_free_html_value,
+        info_text.os2loop_section_page_info_text_value,
+        view_header.os2loop_section_page_view_header_value,
+        view_text.os2loop_section_page_view_text_value
+    FROM node__os2loop_section_page_paragraph AS para_ref
+    LEFT JOIN paragraph__os2loop_section_page_free_html AS free_html
+        ON free_html.entity_id = para_ref.os2loop_section_page_paragraph_target_id
+    LEFT JOIN paragraph__os2loop_section_page_info_text AS info_text
+        ON info_text.entity_id = para_ref.os2loop_section_page_paragraph_target_id
+    -- Header and text are joined to the paragraph independently: nesting the text
+    -- under the header would drop the text of a view paragraph that has no header row.
+    LEFT JOIN paragraph__os2loop_section_page_view_header AS view_header
+        ON view_header.entity_id = para_ref.os2loop_section_page_paragraph_target_id
+    LEFT JOIN paragraph__os2loop_section_page_view_text AS view_text
+        ON view_text.entity_id = para_ref.os2loop_section_page_paragraph_target_id
+) AS sec_page_para
+    ON sec_page_para.entity_id = n_fd.nid
+-- Deterministic output: nodes by id, paragraphs in their stored delta order.
+ORDER BY
+    n_fd.nid,
+    sec_page_para.delta