Skip to content

Commit

Permalink
Merge 00X-controlfields (#142) (#150)
Browse files Browse the repository at this point in the history
* Merge 00X-controlfields (006, 007, 008)

* Allow 1XX vs 7XX to merge even if 7XX has a $9 <KEEP> subfield. NB! $9 <KEEP> is not merged into 1XX.

* Update deps

* 2.1.0-alpha.1

Co-authored-by: nvolk <[email protected]>
  • Loading branch information
ammsalme and nvolk authored Feb 15, 2024
1 parent d8f7928 commit bce5ee7
Show file tree
Hide file tree
Showing 245 changed files with 3,478 additions and 365 deletions.
487 changes: 251 additions & 236 deletions package-lock.json

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"url": "[email protected]:natlibfi/melinda-marc-record-merge-reducers-js.git"
},
"license": "LGPL-3.0+",
"version": "2.0.25",
"version": "2.1.0-alpha.1",
"main": "./dist/index.js",
"engines": {
"node": ">=18"
Expand All @@ -39,10 +39,10 @@
"@natlibfi/marc-record": "^8.1.0",
"@natlibfi/marc-record-merge": "^7.0.2",
"@natlibfi/marc-record-validate": "^8.0.6",
"@natlibfi/marc-record-validators-melinda": "^10.15.6",
"@natlibfi/marc-record-validators-melinda": "^10.16.0",
"@natlibfi/melinda-commons": "^13.0.12",
"debug": "^4.3.4",
"isbn3": "^1.1.44",
"isbn3": "^1.1.45",
"normalize-diacritics": "^4.0.3"
},
"devDependencies": {
Expand All @@ -59,7 +59,7 @@
"chai": "^4.4.1",
"cross-env": "^7.0.3",
"eslint": "^8.56.0",
"mocha": "^10.2.0",
"mocha": "^10.3.0",
"nodemon": "^3.0.3",
"nyc": "^15.1.0"
},
Expand Down
115 changes: 54 additions & 61 deletions src/reducers/controlFieldUtils.js
Original file line number Diff line number Diff line change
@@ -1,31 +1,13 @@
import {nvdebug} from './utils';
import createDebugLogger from 'debug';
//import {nvdebug} from './utils';
//import createDebugLogger from 'debug';

const debug = createDebugLogger('@natlibfi/melinda-marc-record-merge-reducers:controlFieldUtils');
//const debug = createDebugLogger('@natlibfi/melinda-marc-record-merge-reducers:controlFieldUtils');
//const debugData = debug.extend('data');
const debugDev = debug.extend('dev');

function fieldPositionValueContainsInformation(val) {
if (val === '' || val === '|' || val === ' ' || val === '#') {
return false;
}
return true;
}

function getBetterControlFieldPositionValue(c1, c2) {
if (fieldPositionValueContainsInformation(c1)) {
return c1;
}
if (fieldPositionValueContainsInformation(c2)) {
return c2;
}
return c1;
}

//const debugDev = debug.extend('dev');

const f007Lengths = {a: 8, c: 14, d: 6, f: 10, g: 9, h: 13, k: 6, m: 23, o: 2, q: 2, r: 11, s: 14, t: 2, v: 9, z: 2};

function hasLegalLength(field) {
export function hasLegalLength(field) {
if (field.tag === '006') {
return field.value.length === 18;
}
Expand All @@ -34,62 +16,73 @@ function hasLegalLength(field) {
if (field.tag === '007') {
const c0 = field.value.charAt(0);
if (c0 in f007Lengths) {
nvdebug(`${c0}: COMPARE ${f007Lengths[c0]} vs ${field.value.length}`, debugDev);
//nvdebug(`${c0}: COMPARE ${f007Lengths[c0]} vs ${field.value.length}`, debugDev);
return field.value.length === f007Lengths[c0];
}

return false;
return false; // Sanity check. It's ok that no test reaches this poin.
}

if (field.tag === '008') {
return field.value.length === 40;
}

return false;
return false; // Again: a sanity check. No test should reach this point.
}

export function isFillableControlFieldPair(baseField, sourceField) {
if (baseField.value.length !== sourceField.value.length) {
return false;
}
if (!hasLegalLength(baseField)) {
return false;
}

if (baseField.tag === '006' && baseField.value[0] !== sourceField.value[0]) {
return false;
export function genericControlFieldCharPosFix(baseField, sourceField, baseTypeOfMaterial, sourceTypeOfMaterial, rule) { // eslint-disable-line max-params
// Initially written for field 008, but may be applied to 006 and 007 as well (I guess).
// We apply some rules (eg. for government publication) even if baseTypeOfMaterial !== sourceTypeOfMaterial
if (!rule.types.includes(baseTypeOfMaterial) || !rule.types.includes(sourceTypeOfMaterial) || rule.validateOnly) {
return;
}
//console.info(`Apply ${'description' in rule ? rule.description : 'nameless'} rule`); // eslint-disable-line no-console
const legalValues = rule.prioritizedValues;
const position = baseField.tag === '006' ? rule.startPosition - 17 : rule.startPosition; // Field 006 uses rules writted for field 008. 006/01=008/18 etc.
const valueForUnknown = 'valueForUnknown' in rule ? rule.valueForUnknown : undefined;
const [noAttemptToCode] = rule.noAttemptToCode;

if (baseField.tag === '007') {
// 007/00 values must be equal:
if (baseField.value.charAt(0) !== sourceField.value.charAt(0)) {
return false;
}

// 007/01 values must match or contain '|' (undefined):
if (baseField.value.charAt(1) === sourceField.value.charAt(1) || sourceField.value.charAt(1) === '|' || baseField.value.charAt(1) === '|') {
return true;
}
}
const len = legalValues.length > 0 ? legalValues[0].length : noAttemptToCode.length;

const arr1 = baseField.value.split('');
const arr2 = sourceField.value.split('');
if (arr1.every((c, i) => c === arr2[i] || !fieldPositionValueContainsInformation(c) || !fieldPositionValueContainsInformation(arr2[i]))) {
return true;
}
return false;
}
const baseValue = baseField.value.substring(position, position + len);
const sourceValue = sourceField.value.substring(position, position + len);

export function fillControlFieldGaps(baseField, sourceField, min = 0, max = 39) {
// NB! Mergability must be checked before calling this!
//console.info(`${position}: '${baseValue}' vs '${sourceValue}', UNKNOWN: '${valueForUnknown}', type of material: ${typeOfMaterial}`); // eslint-disable-line no-console
//console.info(`Consider ${'description' in rule ? rule.description : 'unnamed'} rule at ${rule.startPosition}:\n'${fieldToString(baseField)}' +\n'${fieldToString(sourceField)}' =`); // eslint-disable-line no-console

if (baseField.value.length !== sourceField.value.length) {
if (applyFix()) {
//console.info(`Apply ${'description' in rule ? rule.description : 'unnamed'} rule at ${rule.startPosition}:\n'${fieldToString(baseField)}' +\n'${fieldToString(sourceField)}' =`); // eslint-disable-line no-console
baseField.value = `${baseField.value.substring(0, position)}${sourceValue}${baseField.value.substring(position + len)}`; // eslint-disable-line functional/immutable-data
//console.info(`'${fieldToString(baseField)}'`); // eslint-disable-line no-console
return;
}
const arr1 = baseField.value.split('');
const arr2 = sourceField.value.split('');
return;

const mergedCharArray = arr1.map((c, i) => i < min || i > max ? c : getBetterControlFieldPositionValue(c, arr2[i]));
function applyFix() {
if (baseValue === sourceValue || legalValues.includes(baseValue)) {
return false;
}
if (legalValues.includes(sourceValue)) {
return true;
}
if (valueForUnknown) {
if (baseValue === valueForUnknown) {
return false;
}
if (sourceValue === valueForUnknown) {
return true;
}
}
if (noAttemptToCode) {
if (baseValue === noAttemptToCode) {
return false;
}
if (sourceValue === noAttemptToCode) {
return true;
}
}
//console.info(`DEFAULT:don't apply fix for ${baseValue} vs ${sourceValue}`); // eslint-disable-line no-console
return false;
}

baseField.value = mergedCharArray.join(''); // eslint-disable-line functional/immutable-data
}
4 changes: 4 additions & 0 deletions src/reducers/controlSubfields.js
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,10 @@ function controlSubfield9PermitsMerge(baseField, sourceField) {
return false;
}

if (baseField.tag.charAt(0) === '1' && !keepOrDrop2.some(sf => (/<DROP>/u).test(sf.value))) {
return false;
}

const sf9lessField1 = baseField.subfields.filter(subfield => retainSubfieldForKeepComparison(subfield));
const sf9lessField2 = sourceField.subfields.filter(subfield => retainSubfieldForKeepComparison(subfield));

Expand Down
162 changes: 152 additions & 10 deletions src/reducers/field006.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import createDebugLogger from 'debug';
import {MarcRecord} from '@natlibfi/marc-record';
import {copyFields, nvdebug} from './utils.js';
import {fillControlFieldGaps, isFillableControlFieldPair} from './controlFieldUtils.js';

import {genericControlFieldCharPosFix, hasLegalLength} from './controlFieldUtils.js';
import {getSingleCharacterPositionRules, isSpecificLiteraryForm, setFormOfItem, setLiteraryForm} from './field008.js';
// Test 02: If Leader 000/06 is 'o' or 'p' in source, copy 006 from source to base as new field (2x)
// Test 03: If Leader 000/06 is something else, do nothing

Expand All @@ -21,12 +21,10 @@ export default () => (base, source) => {
const baseFields = baseRecord.get(/^006$/u);
const sourceFields = sourceRecord.get(/^006$/u);

// If both sides have same number of entries,
// and they apparently are in the same order,
// let's try to fill the gaps:
// If both sides have same number of entries, and they apparently are in the same order, let's try to fill the gaps:
if (baseFields.length > 0 && baseFields.length === sourceFields.length) {
if (baseFields.every((baseField, i) => isFillableControlFieldPair(baseField, sourceFields[i]))) { // eslint-disable-line functional/no-conditional-statements
baseFields.forEach((baseField, i) => fillControlFieldGaps(baseField, sourceFields[i]));
if (baseFields.every((baseField, i) => areMergable006Pair(baseField, sourceFields[i]))) { // eslint-disable-line functional/no-conditional-statements
baseFields.forEach((baseField, i) => fillField006Gaps(baseField, sourceFields[i]));
}
return {base: baseRecord, source};
}
Expand All @@ -38,8 +36,152 @@ export default () => (base, source) => {
return {base: baseRecord, source};
}

// Defy specs: don't copy non-identical fields. Typically we should have only one 007 field.
// And don't merge them either, as it is too risky. Let's just trust base record.
// Defy specs: don't copy non-identical fields. Typically (but not always) we should have only one 006 field.
// Default behaviour: merging is too risky (might describe different materials), so let's just trust base record.
return {base: baseRecord, source};

};

// Character-position rules fetched once, at module load, from field008.js.
// They are expressed in field 008 positions; the 006 code in this file maps them with `rule.startPosition - 17`.
const singleCharacterPositionRules = getSingleCharacterPositionRules();

/**
 * Improve a base 006 field using the corresponding source 006 field.
 *
 * When the base value has an illegal length and the source value a legal one,
 * the source value simply replaces the base value. Otherwise every
 * single-character-position rule is applied to the pair, followed by the
 * form-of-item and literary-form handlers imported from field008.js.
 *
 * @param {{value: string}} baseField - field that is modified in place
 * @param {{value: string}} sourceField - field that supplies the data
 * @returns {undefined}
 */
function fillField006Gaps(baseField, sourceField) {
  const baseIsMalformed = !hasLegalLength(baseField);
  if (baseIsMalformed && hasLegalLength(sourceField)) {
    // Nothing worth salvaging in base: take the source value as is.
    baseField.value = sourceField.value; // eslint-disable-line functional/immutable-data
    return;
  }

  // The same type of material is used for both sides of the pair.
  const materialType = mapFieldToTypeOfMaterial(baseField);
  const applyRule = rule => mergeTwo006Fields(baseField, sourceField, materialType, rule);

  singleCharacterPositionRules.forEach(applyRule);
  setFormOfItem(baseField, sourceField, materialType, materialType);
  setLiteraryForm(baseField, sourceField, materialType, materialType);
}

/**
 * Apply one character-position rule to a base/source pair of 006 fields.
 *
 * Thin wrapper around genericControlFieldCharPosFix(): the single typeOfMaterial is passed
 * as both the base and the source type of material. The helper may rewrite baseField.value in place.
 *
 * @param {{tag: string, value: string}} baseField - field that may be modified
 * @param {{tag: string, value: string}} sourceField - field that supplies the data
 * @param {string} typeOfMaterial - type of material shared by both fields
 * @param {object} rule - one of the rules in singleCharacterPositionRules
 */
function mergeTwo006Fields(baseField, sourceField, typeOfMaterial, rule) {
// Debug aid, left disabled:
//console.info(`Apply ${'description' in rule ? rule.description : 'unnamed'} rule at ${rule.startPosition}:\n'${fieldToString(baseField)}' +\n'${fieldToString(sourceField)}' =`); // eslint-disable-line no-console
genericControlFieldCharPosFix(baseField, sourceField, typeOfMaterial, typeOfMaterial, rule);
//console.info(`'${fieldToString(baseField)}'`); // eslint-disable-line no-console
}

/**
 * Decide whether a base 006 field and a source 006 field are compatible enough to be merged.
 *
 * The pair is accepted when:
 *  - 006/00 is identical on both sides and maps to a known type of material,
 *  - the source field has a legal length, and
 *  - either the base field has an illegal length (the source is then used as such), or every
 *    character position matches, carries no information on at least one side, or is covered
 *    by an explicit exception (see isException()).
 *
 * NB! We explicitly assume that only tag=006 stuff gets this far!
 *
 * @param {{value: string}} field1 - base field
 * @param {{value: string}} field2 - source field
 * @returns {boolean} true when the pair may be merged
 */
function areMergable006Pair(field1, field2) {
  // Check 006/00 and make sure that the source is usable:
  if (field1.value[0] !== field2.value[0] || !hasLegalLength(field2)) {
    return false;
  }
  const typeOfMaterial = mapFieldToTypeOfMaterial(field1);
  if (!typeOfMaterial) { // Must map to some type of material
    return false;
  }

  if (!hasLegalLength(field1)) {
    return true; // If base has illegal size, use source...
  }

  // Both fields have passed hasLegalLength() at this point; the comparison is kept as a cheap safety net.
  if (field1.value.length !== field2.value.length) {
    return false;
  }

  // By default, we try to merge 008/18-34. However we are much stricter with 006 pairs, as we can not be sure they mean the same thing...
  // (There is always one 008, but 006 has 0...n instances.) Thus this does not allow any subsetting etc of, say, BK 006/07-10.
  // We should improve order stuff etc., but let's start with overstrict implementation, as the problem is largely theoretical.
  // The proper solution will eventually be done in field008.js. We can then decide whether we want to use it in 006 as well.
  const baseChars = field1.value.split('');
  const sourceChars = field2.value.split('');
  return baseChars.every((c, i) => c === sourceChars[i] ||
    !field006PositionValueContainsInformation(c, i) ||
    !field006PositionValueContainsInformation(sourceChars[i], i) ||
    isException(c, sourceChars[i], i));

  // Differing, information-carrying values that are nevertheless compatible with each other.
  function isException(c1, c2, characterPosition) {
    // (NB! We know that c1/c2 at character position means the same for both (type of record is always same) as base 006/00 must be source 006/00)
    if (characterPosition === 6 && ['BK', 'CR', 'MU', 'MX'].includes(typeOfMaterial)) {
      // 'o' (online resource) and 'q' are subsets of 's'
      if (['o', 'q'].includes(c1) && c2 === 's') {
        return true;
      }
      if (['o', 'q'].includes(c2) && c1 === 's') {
        return true;
      }
    }

    if (characterPosition === 16 && typeOfMaterial === 'BK') {
      // '1' on one side is compatible with a specific literary form on the other side
      // (presumably '1' is the generic fiction code of 008/33; see isSpecificLiteraryForm() in field008.js).
      if (c1 === '1' && isSpecificLiteraryForm(c2)) {
        return true;
      }
      if (c2 === '1' && isSpecificLiteraryForm(c1)) {
        return true;
      }
    }
    return false;
  }

  // Does character c at the given 006 position say something that must not be overridden?
  function field006PositionValueContainsInformation(c, position) {
    //console.info(`006/${position}: '${c}' (${typeOfMaterial})`); // eslint-disable-line no-console
    if (c === '|') {
      return false;
    }

    if (c === ' ') { // Typically false, but there are some notable exceptions:
      return spaceContainsInformation(position);
    }

    // Compare variable c against relevant rule.valueForUnknown values. Rules use field 008 positions: 006/01 = 008/18 etc.
    const relevantRules = singleCharacterPositionRules.filter(rule => rule.types.includes(typeOfMaterial) && rule.startPosition - 17 === position);
    if (relevantRules.length === 0) { // Not interested
      return false;
    }
    if (relevantRules.some(rule => 'valueForUnknown' in rule && rule.valueForUnknown === c)) {
      return false;
    }

    return true;
  }

  // Positions (per type of material) where a space is a real value instead of "nothing here".
  function spaceContainsInformation(position) {
    // All/some of these should be checked via rules...
    if (position === 1 && typeOfMaterial === 'CR') { // 008/18 frequency
      return true;
    }
    if (position === 4 && typeOfMaterial === 'CR') { // 008/21 type of continuing resource
      return true;
    }
    if ([5, 6].includes(position) && typeOfMaterial === 'MP') { // 008/22-23 (projection in MARC 21 maps; TODO confirm intent)
      return true;
    }
    if (position === 6 && ['BK', 'CR', 'MU', 'MX'].includes(typeOfMaterial)) { // 008/23 form of item '#' means "none of the following"
      return true;
    }
    if (position === 7 && typeOfMaterial === 'CR') { // 008/24 nature of entire work
      return true;
    }
    if (position === 11 && ['BK', 'CF', 'CR', 'MP', 'VM'].includes(typeOfMaterial)) { // 008/28 government publication
      return true;
    }
    if (position === 12 && ['MP', 'VM'].includes(typeOfMaterial)) { // 008/29 form of item '#' means "none of the following"
      return true;
    }
    if (position === 13 && ['MU'].includes(typeOfMaterial)) { // 008/30 Literary text for sound recordings (code 1) (008/31 code is fine/meaningless, if 008/30 is a-z...)
      return true;
    }
    if (position === 17 && typeOfMaterial === 'BK') { // 008/34 (biography for BK in MARC 21; TODO confirm intent)
      return true;
    }
    return false;
  }
}

// Lookup table from the code in 006/00 to the two-letter type-of-material abbreviation used by the rules.
const map06CharPos00ToTypeOfMaterial = {
  'a': 'BK',
  'c': 'MU',
  'd': 'MU',
  'e': 'MP',
  'f': 'MP',
  'g': 'VM',
  'i': 'MU',
  'j': 'MU',
  'k': 'VM',
  'm': 'CF',
  'o': 'VM',
  'p': 'MX',
  'r': 'VM',
  's': 'CR',
  't': 'BK'
};

/**
 * Resolve the type of material of a 006 field from its first character.
 *
 * @param {{value: string}} field - a 006 field
 * @returns {string|undefined} type of material (e.g. 'BK'), or undefined when 006/00 is not a known code
 */
function mapFieldToTypeOfMaterial(field) {
  const formOfMaterial = field.value.charAt(0); // charAt() instead of [0] keeps eslint happy
  return formOfMaterial in map06CharPos00ToTypeOfMaterial ? map06CharPos00ToTypeOfMaterial[formOfMaterial] : undefined;
}
Loading

0 comments on commit bce5ee7

Please sign in to comment.