-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #18 from salsadigitalauorg/develop
Release/0.2.0
- Loading branch information
Showing
38 changed files
with
1,395 additions
and
311 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -74,15 +74,56 @@ jobs: | |
--tag $CIRCLE_TAG \ | ||
--name merlin-framework \ | ||
--file /tmp/merlin-framework.phar | ||
deploy_docs: | ||
docker: | ||
- image: circleci/php:7.3-stretch-node-browsers | ||
steps: | ||
- checkout | ||
- checkout: | ||
path: /tmp/docs | ||
- run: | ||
name: "Deploy docs" | ||
command: | | ||
git config --global user.email "[email protected]" | ||
git config --global user.name "Website Deployment Script" | ||
git -C /tmp/docs checkout --track origin/docs | ||
npm --prefix=/tmp/docs/website install | ||
./.circleci/scripts/docs-sidebar /tmp/docs | ||
cp ~/project/docs/* /tmp/docs/docs | ||
cd /tmp/docs/website | ||
npm run version $CIRCLE_TAG | ||
cd /tmp/docs | ||
git add . | ||
git commit -m "Automated documentation generation" | ||
git push origin docs -f | ||
cd /tmp/docs/website | ||
CURRENT_BRANCH=docs npm run publish-gh-pages | ||
workflows: | ||
version: 2 | ||
main: | ||
jobs: | ||
- build | ||
- build: | ||
filters: | ||
branches: | ||
ignore: | ||
- docs | ||
- deploy: | ||
filters: | ||
branches: | ||
ignore: /.*/ | ||
tags: | ||
only: /^\d+\.\d+\.\d+$/ | ||
- deploy_docs: | ||
filters: | ||
branches: | ||
ignore: /.*/ | ||
tags: | ||
only: /^\d+\.\d+\.\d+$/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
#!/usr/local/bin/php | ||
|
||
<?php
/**
 * Regenerates website/sidebars.json for the documentation site.
 *
 * Usage: docs-sidebar <docs-dir>
 *
 * <docs-dir> is a checkout that contains website/sidebars.json. The script
 * scans the docs directory that sits two levels above this script, picks up
 * every file that declares an `id:` line, groups the ids into sidebar sections
 * via get_menu_key(), orders each section by the optional `weight:` line
 * (default 0) and writes the result back as pretty-printed JSON.
 */
$docs_dir = isset($argv[1]) ? $argv[1] : '';

// A missing argument would otherwise raise a notice and probe "/website/...".
if ($docs_dir === '' || !file_exists("$docs_dir/website/sidebars.json")) {
    echo "Invalid documentation directory.";
    exit(1);
}

$dir = new RecursiveDirectoryIterator(__DIR__ . "/../../docs");
$iterator = new RecursiveIteratorIterator($dir, RecursiveIteratorIterator::SELF_FIRST);

// Sections are declared up front so they keep this order in the sidebar.
$menu_configuration = [
    "docs" => [
        'Introduction' => [],
        'Types' => [],
        'Processors' => [],
    ],
];

foreach ($iterator as $file) {
    if (!$file->isFile()) {
        continue;
    }

    $contents = file_get_contents($file->getPathname());
    if ($contents === false) {
        // Unreadable file: nothing to index.
        continue;
    }

    // Anchor to the start of a line so text such as "valid: x" is not
    // mistaken for an id declaration.
    if (!preg_match('/^id:\s([-\w]+)/m', $contents, $id)) {
        // Not a valid doc file.
        continue;
    }

    // Weight is optional; store it as an integer so sorting is numeric.
    $weight = preg_match('/^weight:\s(-?\d+)/m', $contents, $found) ? (int) $found[1] : 0;

    $menu_configuration['docs'][get_menu_key($id[1])][$id[1]] = $weight;
}

foreach ($menu_configuration['docs'] as $section => $links) {
    // Order by weight, then by id, so that equal weights do not depend on
    // filesystem iteration order or on the sort algorithm's stability.
    uksort($links, function ($a, $b) use ($links) {
        $by_weight = $links[$a] <=> $links[$b];
        return $by_weight !== 0 ? $by_weight : strcmp((string) $a, (string) $b);
    });

    // Assign by key instead of iterating by reference, which would leave a
    // dangling reference to the last section behind.
    $menu_configuration['docs'][$section] = array_keys($links);
}

$json = json_encode($menu_configuration, JSON_PRETTY_PRINT);
if ($json === false || file_put_contents("$docs_dir/website/sidebars.json", $json) === false) {
    fwrite(STDERR, "Unable to write $docs_dir/website/sidebars.json." . PHP_EOL);
    exit(1);
}

// Only report success once the file has actually been written.
echo "Updated sidebar!" . PHP_EOL;
exit(0);
|
||
/**
 * Resolve the sidebar section a document belongs to.
 *
 * The section is decided by the text before the first hyphen of the document
 * id: "processor" maps to Processors, "type" maps to Types and anything else
 * falls back to Introduction.
 *
 * @param string $id
 *   The document id, e.g. "processor-nl2br".
 *
 * @return string
 *   The menu key.
 */
function get_menu_key($id)
{
    // Known id prefixes and the sidebar section each one belongs to.
    $sections = [
        'processor' => 'Processors',
        'type' => 'Types',
    ];

    // Only the leading segment matters, so stop splitting after the first hyphen.
    $prefix = explode('-', $id, 2)[0];

    if (isset($sections[$prefix])) {
        return $sections[$prefix];
    }

    return 'Introduction';
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
--- | ||
id: examples | ||
title: Examples | ||
--- | ||
|
||
# Menu | ||
|
||
Menu structures use the `menu_link` type. This sample configuration demonstrates how to pull the main menu from the Health.vic site, with parent/child relationships intact. | ||
|
||
The selector uses an XPath expression to reference the element in the DOM; in this case all list items contained in the header nav are evaluated for top-level links. The `text` and `link` options are sub-selectors to help define where link text and link values should come from. | ||
|
||
The `children` section allows for sub-menu items to be defined via their own `selector` and configuration. | ||
|
||
``` | ||
--- | ||
domain: https://www2.health.vic.gov.au | ||
urls: | ||
- / | ||
entity_type: menus | ||
mappings: | ||
- | ||
field: main_menu | ||
name: health_main_menu | ||
type: menu_link | ||
selector: '//*[@class="header-nav"]/*/ul/li' | ||
options: | ||
text: './a' | ||
link: './a/@href' | ||
remove_duplicates: true | ||
children: | ||
- | ||
type: menu_link | ||
selector: './descendant::li[@class="dd-level2"]' | ||
options: | ||
text: './a/h3' | ||
link: './a/@href' | ||
``` | ||
|
||
# URL aliases | ||
|
||
The URL alias of each piece of content should be preserved so URLs can remain intact when migrated into the destination CMS. Simply attach the `alias` type to the mappings configuration to ensure URL aliases are captured. | ||
|
||
``` | ||
mappings: | ||
- | ||
field: alias | ||
type: alias | ||
``` | ||
|
||
|
||
# Basic text | ||
|
||
Basic text fields can be mapped in the `mappings` section using the `text` type. Example configuration below: | ||
|
||
``` | ||
mappings: | ||
- | ||
field: title | ||
selector: "#phbody_1_ctl01_h1Title" | ||
type: text | ||
``` | ||
|
||
This type was used for the 'key messages' content. It supports both individual items and arrays of items, e.g. in the case of key messages there are multiple matches on the selector, so an array of plain-text results will exist in the JSON object for import. | ||
|
||
``` | ||
mappings: | ||
- | ||
field: field_key_messages | ||
selector: .m-key-messages .m-b li | ||
type: text | ||
processors: | ||
convert_encoding: | ||
to_encoding: "HTML-ENTITIES" | ||
from_encoding: UTF-8 | ||
html_entity_decode: { } | ||
whitespace: { } | ||
``` | ||
|
||
This also includes additional processors, more detail on these can be found on the [Processors]() page. | ||
|
||
# Long, formatted text | ||
|
||
Long text is used for body content, or anywhere a rich-text WYSIWYG editor may be used. It also allows for embedded media (e.g documents, images). | ||
|
||
This content will generally pass through multiple processors to ensure clean markup, and optionally allows for stripping undesirable attributes or tags. | ||
|
||
The below example would capture an entire body of content found within the `#main` div, removing non-standard tags, removing empty tags, and stripping whitespace. | ||
|
||
``` | ||
mappings: | ||
- | ||
field: field_paragraph_body | ||
selector: '//*[@id="main"]' | ||
type: long_text | ||
processors: | ||
- processor: remove_empty_tags | ||
- | ||
processor: convert_encoding | ||
to_encoding: HTML-ENTITIES | ||
from_encoding: UTF-8 | ||
- | ||
processor: strip_tags | ||
allowed_tags: <h1><h2><h3><h4><h5><ul><ol><dl><dt><dd><li><p><a><strong><em><cite><blockquote><code><s><span><sup><sub><table><caption><tbody><thead><tfoot><th><td><tr><hr><pre><drupal-entity><br> | ||
remove_attr: | ||
- class | ||
- id | ||
- style | ||
- processor: whitespace | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
--- | ||
id: getting-started | ||
title: Getting Started | ||
weight: -1 | ||
--- | ||
|
||
The migration tool provides a standard mechanism for scraping content from DHHS websites, splitting it into logical content structures, and performing additional processing to ensure a result ready for import into Drupal. | ||
|
||
- Initial code is available on https://github.com/salsadigitalauorg/merlin-framework | ||
- As this codebase is likely to be open-sourced and see ongoing development effort the branch `<TBD>` is the safest to use with DHHS migration configurations | ||
|
||
|
||
# Core concepts | ||
|
||
The migration framework expects to take a YAML (.yml) file containing all the configuration required for a migration run. A separate migration configuration exists for each logical content structure split, for example these may be: | ||
- Menus | ||
- Content Type A | ||
- Content Type B | ||
- Taxonomy A | ||
- Taxonomy B | ||
- .. etc | ||
|
||
Each configuration file contains a reference to either a website domain and list of URLs, or a path to relevant XML files (see [XML File Support]()). | ||
|
||
Content from these sources is then passed through mappings, which take selectors (XPath or jQuery-like selectors) to map content from the DOM to the JSON file that gets generated during a run. These data values can also pass through processors to further refine and alter the data. | ||
|
||
# Prerequisites | ||
The framework requires PHP (latest recommended, but tested on most versions of 7.x) and composer. All other dependencies will be pulled in by running a `composer install` | ||
|
||
# Running a migration | ||
To run a migration simply run the tool with the input configuration .yml file, and a path to the output, e.g: | ||
|
||
`php migrate generate -c configs/bhc/fact_sheet.yml -o /path/to/output/` | ||
|
||
You will see output like the following: | ||
``` | ||
Migration framework | ||
=================== | ||
Preparing the configuration | ||
--------------------------- | ||
[OK] Done! | ||
Processing requests | ||
------------------- | ||
Parsing... https://www.betterhealth.vic.gov.au/health/conditionsandtreatments/Treating-persistent-pain (Done!) | ||
... etc (x2000 pages) | ||
Generating files | ||
---------------- | ||
Generating /tmp/page_type.json Done! | ||
Generating /tmp/error-not-found.json Done! | ||
Generating /tmp/media-image-bhc_fact_sheet.json Done! | ||
Generating /tmp/call_to_action.json Done! | ||
Generating /tmp/content_partner.json Done! | ||
Generating /tmp/fact_sheet.json Done! | ||
Generating /tmp/error-404.json Done! | ||
Generating /tmp/media-embedded_video-bhc_fact_sheet.json Done! | ||
[OK] Done! | ||
Completed in 87.295419931412 | ||
``` | ||
|
||
## Refreshing JSON assets | ||
|
||
The resulting JSON files are now ready to push into the Drupal Migration plugins. These files should be hosted somewhere that Drupal can access, e.g a web-accessible URL. | ||
|
||
## Error handling | ||
|
||
There are JSON files generated with error reporting included. These may include `error-not-found.json`, `error-404.json` and `error-unhandled.json`. These will indicate where selectors cannot find matches on any given page, or where a URL does not resolve (404, 500, or similar). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
--- | ||
id: processor-convert-encoding | ||
title: Convert Encoding | ||
sidebar_label: Convert Encoding | ||
--- | ||
|
||
Converts character encoding of data from one type to another. This uses `mb_convert_encoding` and should allow the same values. | ||
|
||
- [phpdocs](https://www.php.net/manual/en/function.mb-convert-encoding.php) | ||
|
||
## Options | ||
|
||
- **to_encoding**`<default: UTF-8>`: The encoding to convert to. | ||
- **from_encoding**`<default: null>`: The encoding to convert from. | ||
|
||
## Usage | ||
|
||
``` | ||
processors: | ||
- | ||
processor: convert_encoding | ||
to_encoding: UTF-8 | ||
from_encoding: auto | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
--- | ||
id: processor-html-entity-decode | ||
title: Html Entity Decode | ||
sidebar_label: Html Entity Decode | ||
--- | ||
|
||
Converts HTML entities (e.g. `&quot;`) to their corresponding characters. | ||
|
||
## Options | ||
|
||
Doesn't provide options. | ||
|
||
## Usage | ||
|
||
``` | ||
processors: | ||
- | ||
processor: html_entity_decode | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
--- | ||
id: processor-nl2br | ||
title: nl2br | ||
sidebar_label: nl2br | ||
--- | ||
|
||
Converts raw newlines to `<br>` markup. | ||
|
||
## Options | ||
|
||
Doesn't provide options. | ||
|
||
## Usage | ||
|
||
``` | ||
processors: | ||
- | ||
processor: nl2br | ||
``` |
Oops, something went wrong.