Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into deps
Browse files Browse the repository at this point in the history
# Conflicts:
#	yarn.lock
  • Loading branch information
tpluscode committed Nov 4, 2024
2 parents 612232a + 9fbee49 commit 7931af9
Show file tree
Hide file tree
Showing 8 changed files with 799 additions and 226 deletions.
6 changes: 6 additions & 0 deletions .changeset/bright-goats-smell.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
"@cube-creator/model": patch
"@cube-creator/core-api": patch
---

Guess default column datatype from CSV sample values
2 changes: 2 additions & 0 deletions apis/core/lib/domain/table/Table.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import * as ColumnMapping from '@cube-creator/model/ColumnMapping'
import $rdf from 'rdf-ext'
import slug from 'slug'
import { Link } from '@cube-creator/model/lib/Link'
import { inferDatatype } from '@cube-creator/model/lib/datatypeInference'
import { ResourceStore } from '../../ResourceStore'
import * as id from '../identifiers'

Expand Down Expand Up @@ -92,6 +93,7 @@ export default function mixin<Base extends Constructor<Table>>(Resource: Base):
return this.addLiteralColumnMapping({
store,
sourceColumn: column,
datatype: inferDatatype(column.samples),
targetProperty: $rdf.literal(slug(column.name)),
})
}
Expand Down
12 changes: 10 additions & 2 deletions mocha-setup.js
Original file line number Diff line number Diff line change
@@ -1,22 +1,30 @@
/* eslint-disable import/no-unresolved */
/* eslint-disable import/no-extraneous-dependencies */
/* eslint-disable @typescript-eslint/no-var-requires */
// Register Babel's require hook using the repository Babel config, for the
// JavaScript/TypeScript extensions listed below.
require('@babel/register')({
configFile: './babel.config.json',
extensions: ['.js', '.jsx', '.ts', '.tsx'],
})

// Read environment settings from the .local.env file located next to this script.
require('dotenv').config({
// NOTE(review): the two `path` lines below differ only by the trailing comma and
// look like the removed/added pair of a diff hunk; presumably only the second one
// belongs in the real file (confirm against the repository).
path: require('path').resolve(__dirname, '.local.env')
path: require('path').resolve(__dirname, '.local.env'),
})

// Required for its side effects only (presumably registers snapshot assertions with chai).
require('chai-snapshot-matcher')
const chai = require('chai')
const sinonChai = require('sinon-chai')
const quantifiers = require('chai-quantifiers')

// NOTE(review): `var`/`const chaiAsPromised` also look like a removed/added diff pair;
// presumably only the `const` declaration belongs in the real file (confirm).
var chaiAsPromised = require('chai-as-promised')
const chaiAsPromised = require('chai-as-promised')
// Install the chai plugins loaded above.
chai.use(chaiAsPromised)
chai.use(quantifiers)

// Required for its side effects only (presumably registers the project's shape matcher).
require('./packages/testing/lib/chaiShapeMatcher')

chai.use(sinonChai)

// mocha-chai-rdf matchers are loaded with a dynamic import() instead of require().
// The plugin is registered with chai once the module promise resolves; the promise
// is deliberately not awaited, so registration completes asynchronously.
import('mocha-chai-rdf/matchers.js').then(({ default: rdfMatchers }) => {
  chai.use(rdfMatchers)
})
65 changes: 65 additions & 0 deletions packages/model/lib/datatypeInference.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import { NamedNode } from '@rdfjs/types'
import { xsd } from '@tpluscode/rdf-ns-builders'
import { validators } from 'rdf-validate-datatype'

/** Predicate telling whether a lexical value is acceptable for a datatype. */
type Validator = (value: string) => boolean

/**
 * Looks up the validator that `rdf-validate-datatype` provides for a datatype.
 *
 * @param name datatype identifier to look up
 * @returns the validator found, or a validator rejecting every value when
 * the lookup yields nothing
 */
const getValidator = (name: NamedNode): Validator => {
  const registered = validators.find(name)
  return registered ?? (() => false)
}

/** A candidate datatype considered while inferring the datatype of a column. */
interface Datatype {
  /** Returns true when the given lexical value is valid for this datatype. */
  check: Validator
  /** Identifier of the datatype; this is what the inference finally returns. */
  name: NamedNode
  /** Datatypes to fall back to when a later value does not pass `check`. */
  broader: Datatype[]
}

/**
 * Builds a candidate datatype record.
 *
 * @param name datatype identifier
 * @param broader datatypes that may replace this one when a value fails its check
 * @returns the record, with `check` resolved through `getValidator`
 */
const getDatatype = (name: NamedNode, ...broader: Datatype[]): Datatype => {
  const check = getValidator(name)
  return { name, check, broader }
}

/**
 * Lists the candidate datatypes in the order in which they are tried.
 *
 * gDay, gMonth and gYear are left out on purpose because they are easily
 * confused with integer.
 */
const getDatatypes = (): Datatype[] => {
  const decimal = getDatatype(xsd.decimal)
  // decimal is the broader fallback of integer, so integer has to be tried first
  const integer = getDatatype(xsd.integer, decimal)
  // the remaining candidates have no broader datatype to fall back to
  const withoutBroader = [xsd.date, xsd.time, xsd.dateTime, xsd.gYearMonth, xsd.boolean]
    .map(name => getDatatype(name))

  return [integer, decimal, ...withoutBroader]
}

/**
 * Advances an iterator until it is exhausted or yields a value accepted by
 * the predicate. Values rejected by the predicate are consumed and dropped.
 *
 * @param iterator iterator to advance; it is left positioned right after the returned result
 * @param predicate test applied to every yielded value
 * @returns the first iterator result that is either done or accepted by the predicate
 */
const nextUntil = <T>(iterator: Iterator<T>, predicate: (value: T) => boolean) => {
  let step = iterator.next()
  while (!step.done && !predicate(step.value)) {
    step = iterator.next()
  }
  return step
}

/**
 * Guesses a datatype for a sequence of sample values.
 *
 * Empty strings are ignored. The first non-empty value selects the first
 * candidate from `getDatatypes` whose check accepts it. Each later value that
 * fails the current candidate's check moves the guess to one of that
 * candidate's broader datatypes. The outcome therefore depends on the order
 * of the values: the candidate chosen for the first value is only ever
 * replaced by one of its broader datatypes.
 *
 * @param values sample values; iterated at most once
 * @returns the inferred datatype, or `xsd:string` when there is no non-empty
 * value, no candidate accepts the first value, or no broader datatype accepts
 * a later value
 */
export function inferDatatype(values: Iterable<string>): NamedNode {
  const samples = values[Symbol.iterator]()

  // skip leading empty strings; without a real sample there is nothing to infer from
  const firstSample = nextUntil(samples, sample => sample !== '')
  if (firstSample.done) {
    return xsd.string
  }

  // pick the first candidate accepting the first sample
  const firstMatch = nextUntil(getDatatypes()[Symbol.iterator](), type => type.check(firstSample.value))
  if (firstMatch.done) {
    return xsd.string
  }

  let inferred: Datatype = firstMatch.value
  for (;;) {
    const current = inferred
    // consume samples until one is non-empty and rejected by the current guess
    const mismatch = nextUntil(samples, sample => sample !== '' && !current.check(sample))
    if (mismatch.done) {
      // every remaining sample was accepted
      return current.name
    }

    // widen the guess to the first broader datatype accepting the offending sample
    const widened = nextUntil(current.broader[Symbol.iterator](), type => type.check(mismatch.value))
    if (widened.done) {
      return xsd.string
    }
    inferred = widened.value
  }
}
7 changes: 5 additions & 2 deletions packages/model/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,21 @@
"@rdfine/rdfs": "^0.6.3",
"@rdfine/schema": "^0.6.3",
"@rdfine/shacl": "^0.8.5",
"@rdfjs/types": "^1.1.0",
"@tpluscode/rdf-ns-builders": "^1.0.0",
"@tpluscode/rdfine": "^0.5.19",
"is-uri": "^1.2.0",
"rdf-validate-datatype": "^0.1.3",
"uri-template": "^1.0.1"
},
"devDependencies": {
"@cube-creator/testing": "^0.1.21",
"@types/clownface": "^1",
"@types/is-uri": "^1",
"@rdfjs/types": "^1.1.0",
"@types/rdf-validate-datatype": "^0.1",
"alcaeus": "^2",
"chai": "^4.3.4",
"mocha": "^10"
"mocha": "^10",
"mocha-chai-rdf": "^0.1.5"
}
}
56 changes: 56 additions & 0 deletions packages/model/test/lib/datatypeInference.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import { describe, it } from 'mocha'
import { expect } from 'chai'
import { xsd } from '@tpluscode/rdf-ns-builders'
import { inferDatatype } from '../../lib/datatypeInference'

// Specification of inferDatatype: single values, empty input, and how the guess
// evolves when later values do not fit the datatype chosen for the first one.
describe('@cube-creator/model/DatatypeChecker', () => {
  it('recognize xsd:integer', () => {
    expect(inferDatatype(['42'])).to.eq(xsd.integer)
  })
  it('recognize xsd:decimal', () => {
    expect(inferDatatype(['42.1'])).to.eq(xsd.decimal)
  })
  it('recognize xsd:boolean', () => {
    // if the first value was 0 or 1, it would be considered as xsd:integer
    expect(inferDatatype(['true', 'false', '0', '1'])).to.eq(xsd.boolean)
  })
  it('recognize xsd:date', () => {
    expect(inferDatatype(['2021-01-01'])).to.eq(xsd.date)
  })
  it('recognize xsd:time', () => {
    expect(inferDatatype(['23:57:05'])).to.eq(xsd.time)
  })
  it('recognize xsd:dateTime', () => {
    expect(inferDatatype(['2021-01-01T23:57:05'])).to.eq(xsd.dateTime)
  })
  it('recognize xsd:gYearMonth', () => {
    expect(inferDatatype(['2021-12'])).to.eq(xsd.gYearMonth)
  })
  it('recognize xsd:string', () => {
    expect(inferDatatype(['abc'])).to.eq(xsd.string)
  })
  it('recognize two xsd:integer values', () => {
    expect(inferDatatype(['42', '42'])).to.eq(xsd.integer)
  })
  it('recognize xsd:string with empty array', () => {
    expect(inferDatatype([])).to.eq(xsd.string)
  })
  it('recognize xsd:string with empty string', () => {
    expect(inferDatatype([''])).to.eq(xsd.string)
  })
  it('recognize xsd:integer ignoring empty strings', () => {
    expect(inferDatatype(['', '42', ''])).to.eq(xsd.integer)
  })
  it('recognize xsd:string after xsd:date', () => {
    expect(inferDatatype(['2021-01-01', 'foo'])).to.eq(xsd.string)
  })
  it('recognize xsd:decimal after xsd:integer', () => {
    expect(inferDatatype(['42', '42.1'])).to.eq(xsd.decimal)
  })
  it('keep xsd:decimal when xsd:integer follows', () => {
    // an integer literal is also a valid decimal, so the broader guess is kept
    expect(inferDatatype(['42.1', '42'])).to.eq(xsd.decimal)
  })
  it('recognize xsd:string after xsd:integer', () => {
    expect(inferDatatype(['42', 'foo'])).to.eq(xsd.string)
  })
  it('recognize xsd:string when mixed types', () => {
    expect(inferDatatype(['', '42', '2021-01-01'])).to.eq(xsd.string)
  })
  it('accept any iterable of values', () => {
    // the parameter is an Iterable, not necessarily an array
    expect(inferDatatype(new Set(['', '2021-01-01']))).to.eq(xsd.date)
  })
})
3 changes: 2 additions & 1 deletion tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
"include": [
"packages",
"apis",
"cli"
"cli",
"mocha-setup.js"
],
"exclude": [
"node_modules"
Expand Down
Loading

0 comments on commit 7931af9

Please sign in to comment.