Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds ISL model #251

Merged
merged 1 commit into from
Apr 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import com.amazon.ionschema.IonSchemaVersion
import com.amazon.ionschema.Violation
import com.amazon.ionschema.Violations
import com.amazon.ionschema.internal.util.islRequire
import com.amazon.ionschema.internal.util.validateRegexPattern
import java.util.regex.Pattern

/**
Expand Down Expand Up @@ -59,8 +60,8 @@ internal class Regex(
}
flags = flags.or(flag)
}

pattern = toPattern(ion.stringValue(), flags)
val patternString = validateRegexPattern(ion.stringValue(), islVersion)
pattern = Pattern.compile(patternString, flags)
}

override fun validate(value: IonValue, issues: Violations) {
Expand All @@ -75,155 +76,4 @@ internal class Regex(
}
}
}

private fun toPattern(string: String, flags: Int): Pattern {
val si = StringIterator(string)
val sb = StringBuilder()
var ch = si.next()
do {
when (ch) {
'[' -> {
sb.append(ch)
parseCharacterClass(si, sb)
}
'(' -> {
sb.append(ch)
ch = si.next()
if (ch == '?') { // error on "(?..." constructs
error(si, "invalid character '$ch'")
}
sb.append(ch)
}
'\\' -> { // handle escaped chars
ch = si.next()
when (ch) {
'.', '^', '$', '|', '?', '*', '+', '\\',
'[', ']', '(', ')', '{', '}',
'w', 'W', 'd', 'D' -> sb.append('\\').append(ch)
's' -> sb.append("[ \\f\\n\\r\\t]")
'S' -> sb.append("[^ \\f\\n\\r\\t]")
else -> error(si, "invalid escape character '$ch'")
}
}
else -> sb.append(ch) // otherwise, accept the character
}

parseQuantifier(si, sb) // parse a quantifier, if present

ch = si.next()
} while (ch != null)

return Pattern.compile(sb.toString(), flags)
}

private fun parseCharacterClass(si: StringIterator, sb: StringBuilder) {
do {
val ch = si.next()
sb.append(ch)

when (ch) {
'&' -> {
if (si.peek() == '&') {
error(si, "'&&' is not supported in a character class")
}
}

'[' -> error(si, "'[' must be escaped within a character class")

'\\' -> {
when (val ch2 = si.next()) {
'[', ']', '\\' -> sb.append(ch2)
'd', 's', 'w', 'D', 'S', 'W' -> if (islVersion == IonSchemaVersion.v1_0) {
// For Ion Schema 1.0, this is an error because ISL 1.0 does
// not support pre-defined char classes (i.e., \d, \s, \w)
// while user is specifying a new char class
error(si, "invalid sequence '\\$ch2' in character class")
} else {
// In Ion Schema 2.0, this is allowed
sb.append(ch2)
}
else -> error(
si,
"invalid sequence '\\$ch2' in character class"
)
}
}

']' -> return
}
} while (ch != null)

error(si, "character class missing ']'")
}

private fun parseQuantifier(si: StringIterator, sb: StringBuilder) {
val initialLength = sb.length
var ch = si.peek()
when (ch) {
'?', '*', '+' -> {
ch = si.next()
sb.append(ch)
}
'{' -> {
ch = si.next()
sb.append(ch)
var complete = false
// A quantifier such as {,3} is not an ECMA 262 quantifier (it has no lower bound)
// We track whether we've found a number so that we can ensure that a comma is only
// allowed if it follows at least one digit.
var foundAnyNumber = false
do {
ch = si.next()
when (ch) {
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9' -> { sb.append(ch); foundAnyNumber = true }
',' -> if (foundAnyNumber) sb.append(ch) else error(si, "range quantifier is missing lower bound")
'}' -> {
sb.append(ch)
complete = true
}
null -> {}
else -> error(si, "invalid character '$ch'")
}
} while (ch != null && !complete)

if (!complete) {
error(si, "range quantifier missing '}'")
}
}
}

if (sb.length > initialLength && ch != null) {
ch = si.peek()
when (ch) {
'?' -> error(si, "invalid character '$ch'")
'+' -> error(si, "invalid character '$ch'")
}
}
}

private fun error(si: StringIterator, message: String): Unit =
throw InvalidSchemaException("$message in regex '$si' at offset ${si.currentIndex()}")
}

private class StringIterator(private val s: String) {
private var index = -1
val length = s.length

fun next(): Char? {
index += 1
return get(index)
}

fun peek() = get(index + 1)

private fun get(i: Int): Char? {
if (i < length) {
return s[i]
}
return null
}

fun currentIndex() = index

override fun toString() = s
}
157 changes: 157 additions & 0 deletions ion-schema/src/main/kotlin/com/amazon/ionschema/internal/util/regex.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
package com.amazon.ionschema.internal.util

import com.amazon.ionschema.InvalidSchemaException
import com.amazon.ionschema.IonSchemaVersion

/**
* Validates that a regex pattern is valid for the subset of ECMA-262 that is supported by Ion Schema, and converts
* to the equivalent syntax for Java Regex.
*/
internal fun validateRegexPattern(string: String, islVersion: IonSchemaVersion = IonSchemaVersion.v2_0): String {
val si = StringIterator(string)
val sb = StringBuilder()
var ch = si.next()
do {
when (ch) {
'[' -> {
sb.append(ch)
parseCharacterClass(si, sb, islVersion)
}
'(' -> {
sb.append(ch)
ch = si.next()
if (ch == '?') { // error on "(?..." constructs
error(si, "invalid character '$ch'")
}
sb.append(ch)
}
'\\' -> { // handle escaped chars
ch = si.next()
when (ch) {
'.', '^', '$', '|', '?', '*', '+', '\\',
'[', ']', '(', ')', '{', '}',
'w', 'W', 'd', 'D' -> sb.append('\\').append(ch)
's' -> sb.append("[ \\f\\n\\r\\t]")
'S' -> sb.append("[^ \\f\\n\\r\\t]")
else -> error(si, "invalid escape character '$ch'")
}
}
else -> sb.append(ch) // otherwise, accept the character
}

parseQuantifier(si, sb) // parse a quantifier, if present

ch = si.next()
} while (ch != null)

return sb.toString() // Pattern.compile(sb.toString(), flags)
}

private fun parseCharacterClass(si: StringIterator, sb: StringBuilder, islVersion: IonSchemaVersion) {
do {
val ch = si.next()
sb.append(ch)

when (ch) {
'&' -> {
if (si.peek() == '&') {
error(si, "'&&' is not supported in a character class")
}
}

'[' -> error(si, "'[' must be escaped within a character class")

'\\' -> {
when (val ch2 = si.next()) {
'[', ']', '\\' -> sb.append(ch2)
'd', 's', 'w', 'D', 'S', 'W' -> if (islVersion == IonSchemaVersion.v1_0) {
// For Ion Schema 1.0, this is an error because ISL 1.0 does
// not support pre-defined char classes (i.e., \d, \s, \w)
// while user is specifying a new char class
error(si, "invalid sequence '\\$ch2' in character class")
} else {
// In Ion Schema 2.0, this is allowed
sb.append(ch2)
}
else -> error(
si,
"invalid sequence '\\$ch2' in character class"
)
}
}

']' -> return
}
} while (ch != null)

error(si, "character class missing ']'")
}
private fun parseQuantifier(si: StringIterator, sb: StringBuilder) {
val initialLength = sb.length
var ch = si.peek()
when (ch) {
'?', '*', '+' -> {
ch = si.next()
sb.append(ch)
}
'{' -> {
ch = si.next()
sb.append(ch)
var complete = false
// A quantifier such as {,3} is not an ECMA 262 quantifier (it has no lower bound)
// We track whether we've found a number so that we can ensure that a comma is only
// allowed if it follows at least one digit.
var foundAnyNumber = false
do {
ch = si.next()
when (ch) {
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9' -> { sb.append(ch); foundAnyNumber = true }
',' -> if (foundAnyNumber) sb.append(ch) else error(si, "range quantifier is missing lower bound")
'}' -> {
sb.append(ch)
complete = true
}
null -> {}
else -> error(si, "invalid character '$ch'")
}
} while (ch != null && !complete)

if (!complete) {
error(si, "range quantifier missing '}'")
}
}
}

if (sb.length > initialLength && ch != null) {
ch = si.peek()
when (ch) {
'?' -> error(si, "invalid character '$ch'")
'+' -> error(si, "invalid character '$ch'")
}
}
}
private fun error(si: StringIterator, message: String): Unit =
throw InvalidSchemaException("$message in regex '$si' at offset ${si.currentIndex()}")

private class StringIterator(private val s: String) {
private var index = -1
val length = s.length

fun next(): Char? {
index += 1
return get(index)
}

fun peek() = get(index + 1)

private fun get(i: Int): Char? {
if (i < length) {
return s[i]
}
return null
}

fun currentIndex() = index

override fun toString() = s
}
Loading