Skip to content

Commit a8cee48

Browse files
authored
Merge pull request #874 from Kotlin/parsing-optimization
Parsing improvements
2 parents b3b1f64 + 2a90396 commit a8cee48

File tree

7 files changed

+1341
-172
lines changed

7 files changed

+1341
-172
lines changed

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ public interface GlobalParserOptions {
4141

4242
public data class ParserOptions(
4343
val locale: Locale? = null,
44+
// TODO, migrate to kotlinx.datetime.format.DateTimeFormat? https://github.com/Kotlin/dataframe/issues/876
4445
val dateTimeFormatter: DateTimeFormatter? = null,
4546
val dateTimePattern: String? = null,
4647
val nullStrings: Set<String>? = null,

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt

Lines changed: 249 additions & 86 deletions
Large diffs are not rendered by default.

core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt

Lines changed: 346 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,35 @@
11
package org.jetbrains.kotlinx.dataframe.api
22

3+
import io.kotest.matchers.should
34
import io.kotest.matchers.shouldBe
5+
import kotlinx.datetime.DateTimeUnit
46
import kotlinx.datetime.Instant
57
import kotlinx.datetime.LocalDate
68
import kotlinx.datetime.LocalDateTime
79
import kotlinx.datetime.LocalTime
810
import kotlinx.datetime.Month
11+
import kotlinx.datetime.format.DateTimeComponents
12+
import kotlinx.datetime.plus
13+
import kotlinx.datetime.toJavaInstant
14+
import kotlinx.datetime.toKotlinInstant
915
import org.jetbrains.kotlinx.dataframe.DataFrame
16+
import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
17+
import org.jetbrains.kotlinx.dataframe.impl.catchSilent
1018
import org.jetbrains.kotlinx.dataframe.type
1119
import org.junit.Test
1220
import java.util.Locale
21+
import kotlin.random.Random
1322
import kotlin.reflect.typeOf
23+
import kotlin.time.Duration
1424
import kotlin.time.Duration.Companion.days
1525
import kotlin.time.Duration.Companion.hours
26+
import kotlin.time.Duration.Companion.microseconds
27+
import kotlin.time.Duration.Companion.milliseconds
1628
import kotlin.time.Duration.Companion.minutes
29+
import kotlin.time.Duration.Companion.nanoseconds
1730
import kotlin.time.Duration.Companion.seconds
31+
import java.time.Duration as JavaDuration
32+
import java.time.Instant as JavaInstant
1833

1934
class ParseTests {
2035
@Test
@@ -142,9 +157,340 @@ class ParseTests {
142157
columnOf("2022-01-23T04:29:40").parse().type shouldBe typeOf<LocalDateTime>()
143158
}
144159

160+
/**
 * Compares the `Instant` parsers registered in [Parsers] (kotlinx-datetime and java.time flavours)
 * against reference parsers for timestamps whose hour/minute/second fields sit on or beyond
 * their usual upper bounds (e.g. `24:00:00`, `23:59:60`).
 */
@Test
fun `can parse instants`() {
    val kotlinInstantParser = Parsers[typeOf<Instant>()]!!.applyOptions(null)
    val javaInstantParser = Parsers[typeOf<JavaInstant>()]!!.applyOptions(null)

    // Reference behaviour borrowed from the kotlinx-datetime tests: mimics how java instants
    // treat leap seconds and "24:00:00" by normalising the parsed components first.
    fun parseInstantLikeJavaDoesOrNull(input: String): Instant? =
        catchSilent {
            val components = DateTimeComponents.Formats.ISO_DATE_TIME_OFFSET.parseOrNull(input)
            components?.apply {
                when {
                    // 24:00:00.0 is rolled over to 00:00 of the following day
                    hour == 24 && minute == 0 && second == 0 && nanosecond == 0 -> {
                        setDate(toLocalDate().plus(1, DateTimeUnit.DAY))
                        hour = 0
                    }

                    // a leap second at the very end of the day is clamped to :59
                    hour == 23 && minute == 59 && second == 60 -> second = 59
                }
            }?.toInstantUsingOffset()
        }

    // values just after zero and around the 59/60 rollover, used for both minutes and seconds
    val boundaryValues = (0..5) + (58..62)

    for (hour in 23..25) {
        for (minute in boundaryValues) {
            for (second in boundaryValues) {
                val mm = minute.toString().padStart(2, '0')
                val ss = second.toString().padStart(2, '0')
                val input = "2020-03-16T$hour:$mm:${ss}Z"

                val parsedKotlin = kotlinInstantParser(input) as Instant?
                val parsedJava = javaInstantParser(input) as JavaInstant?
                val kotlinReference = catchSilent { Instant.parse(input) }
                val javaLikeReference = parseInstantLikeJavaDoesOrNull(input)
                val javaReference = catchSilent { JavaInstant.parse(input) }

                // our parser has a fallback mechanism built in, like this
                parsedKotlin shouldBe (kotlinReference ?: javaReference?.toKotlinInstant())
                parsedKotlin shouldBe javaLikeReference

                parsedJava shouldBe javaReference

                // all flavours must agree once converted to java instants
                parsedKotlin?.toJavaInstant() shouldBe javaLikeReference?.toJavaInstant()
                javaLikeReference?.toJavaInstant() shouldBe parsedJava
                parsedJava shouldBe javaReference
            }
        }
    }
}
206+
145207
/**
 * Checks that a string column holding Kotlin-style duration strings is parsed into
 * a column of the corresponding [Duration] values.
 */
@Test
fun `parse duration`() {
    val parsed = columnOf("1d 15m", "20h 35m 11s").parse()
    val expected = columnOf(
        1.days + 15.minutes,
        20.hours + 35.minutes + 11.seconds,
    )

    parsed shouldBe expected
}
212+
213+
/**
 * Exercises the `Duration` parsers registered in [Parsers] (kotlin.time and java.time flavours)
 * with ISO-8601 duration strings and compares them against `Duration.parse` / `JavaDuration.parse`.
 */
@Test
fun `can parse duration isoStrings`() {
    val parseKotlinDuration = Parsers[typeOf<Duration>()]!!.applyOptions(null) as (String) -> Duration?
    val parseJavaDuration = Parsers[typeOf<JavaDuration>()]!!.applyOptions(null) as (String) -> JavaDuration?

    // The first string must be the canonical ISO rendering of [duration]; every string must parse to it.
    // The java parser only has to agree with whatever `JavaDuration.parse` yields (or null if it throws).
    fun testSuccess(duration: Duration, vararg isoStrings: String) {
        isoStrings.first() shouldBe duration.toIsoString()
        isoStrings.forEach { iso ->
            Duration.parse(iso) shouldBe duration
            parseKotlinDuration(iso) shouldBe duration

            val javaReference = catchSilent { JavaDuration.parse(iso) }
            parseJavaDuration(iso) shouldBe javaReference
        }
    }

    // For inputs that are malformed (or valid for only one of the reference parsers):
    // our parsers must produce exactly what the reference parsers produce, null on failure.
    fun testSameAsReference(isoString: String) {
        val kotlinReference = catchSilent { Duration.parse(isoString) }
        kotlinReference shouldBe parseKotlinDuration(isoString)

        val javaReference = catchSilent { JavaDuration.parse(isoString) }
        javaReference shouldBe parseJavaDuration(isoString)
    }

    // zero
    testSuccess(Duration.ZERO, "PT0S", "P0D", "PT0H", "PT0M", "P0DT0H", "PT0H0M", "PT0H0S")

    // single unit
    testSuccess(1.days, "PT24H", "P1D", "PT1440M", "PT86400S")
    testSuccess(1.hours, "PT1H")
    testSuccess(1.minutes, "PT1M")
    testSuccess(1.seconds, "PT1S")
    testSuccess(1.milliseconds, "PT0.001S")
    testSuccess(1.microseconds, "PT0.000001S")
    testSuccess(1.nanoseconds, "PT0.000000001S", "PT0.0000000009S")
    testSuccess(0.9.nanoseconds, "PT0.000000001S")

    // rounded to zero
    testSuccess(0.1.nanoseconds, "PT0S")
    testSuccess(Duration.ZERO, "PT0S", "PT0.0000000004S")

    // several units combined
    testSuccess(1.days + 1.minutes, "PT24H1M")
    testSuccess(1.days + 1.seconds, "PT24H0M1S")
    testSuccess(1.days + 1.milliseconds, "PT24H0M0.001S")
    testSuccess(1.hours + 30.minutes, "PT1H30M")
    testSuccess(1.hours + 500.milliseconds, "PT1H0M0.500S")
    testSuccess(2.minutes + 500.milliseconds, "PT2M0.500S")
    testSuccess(90_500.milliseconds, "PT1M30.500S")

    // with sign
    testSuccess(-1.days + 15.minutes, "-PT23H45M", "PT-23H-45M", "+PT-24H+15M")
    testSuccess(-1.days - 15.minutes, "-PT24H15M", "PT-24H-15M", "-PT25H-45M")
    testSuccess(Duration.ZERO, "PT0S", "P1DT-24H", "+PT-1H+60M", "-PT1M-60S")

    // infinite
    testSuccess(
        Duration.INFINITE,
        "PT9999999999999H",
        "PT+10000000000000H",
        "-PT-9999999999999H",
        "-PT-1234567890123456789012S",
    )
    testSuccess(-Duration.INFINITE, "-PT9999999999999H", "-PT10000000000000H", "PT-1234567890123456789012S")

    val referenceComparisonInputs = listOf(
        "",
        " ",
        "P",
        "PT",
        "P1DT",
        "P1",
        "PT1",
        "0",
        "+P",
        "+",
        "-",
        "h",
        "H",
        "something",
        "1m",
        "1d",
        "2d 11s",
        "Infinity",
        "-Infinity", // successful in kotlin, not in java
        "P+12+34D",
        "P12-34D",
        "PT1234567890-1234567890S",
        " P1D",
        "PT1S ",
        "P3W",
        "P1Y",
        "P1M",
        "P1S",
        "PT1D",
        "PT1Y",
        "PT1S2S",
        "PT1S2H",
        "P9999999999999DT-9999999999999H",
        "PT1.5H",
        "PT0.5D",
        "PT.5S",
        "PT0.25.25S",
    )
    for (input in referenceComparisonInputs) {
        testSameAsReference(input)
    }
}
314+
315+
/**
 * Exercises the kotlin.time `Duration` parser registered in [Parsers] with strings in Kotlin's
 * default `Duration.toString()` format (plus signed/parenthesised variants) and compares the
 * results against `Duration.parse`.
 */
@Test
fun `can parse duration default kotlin strings`() {
    val parseDuration = Parsers[typeOf<Duration>()]!!.applyOptions(null) as (String) -> Duration?

    // Both the stdlib parser and our parser must turn [string] into [expectedDuration].
    fun testParsing(string: String, expectedDuration: Duration) {
        Duration.parse(string) shouldBe expectedDuration
        parseDuration(string) shouldBe expectedDuration
    }

    // The first expected string must be the default rendering of [duration]; every expected string
    // must parse back to it. Positive finite durations are additionally checked with sign prefixes.
    fun testSuccess(duration: Duration, vararg expected: String) {
        val rendered = duration.toString()
        rendered shouldBe expected.first()

        val positive = duration.isPositive()
        if (positive) {
            // multi-component renderings are wrapped in parentheses when negated
            val negatedRendering = if (' ' in rendered) "-($rendered)" else "-$rendered"
            (-duration).toString() shouldBe negatedRendering
        }

        expected.forEach { text ->
            testParsing(text, duration)
            if (positive && duration.isFinite()) {
                testParsing("+($text)", duration)
                testParsing("-($text)", -duration)
                // a bare sign prefix is only tried for single-component strings
                if (' ' !in text) {
                    testParsing("+$text", duration)
                    testParsing("-$text", -duration)
                }
            }
        }
    }

    // Our parser must produce exactly what `Duration.parse` produces, null when that throws.
    fun testSameAsReference(input: String) {
        val reference = catchSilent { Duration.parse(input) }
        reference shouldBe parseDuration(input)
    }

    testSuccess(101.days, "101d", "2424h")
    testSuccess(45.3.days, "45d 7h 12m", "45.3d", "45d 7.2h") // 0.3d == 7.2h
    testSuccess(45.days, "45d")

    testSuccess(40.5.days, "40d 12h", "40.5d", "40d 720m")
    testSuccess(40.days + 20.minutes, "40d 0h 20m", "40d 20m", "40d 1200s")
    testSuccess(40.days + 20.seconds, "40d 0h 0m 20s", "40d 20s")
    testSuccess(40.days + 100.nanoseconds, "40d 0h 0m 0.000000100s", "40d 100ns")

    testSuccess(40.hours + 15.minutes, "1d 16h 15m", "40h 15m")
    testSuccess(40.hours, "1d 16h", "40h")

    testSuccess(12.5.hours, "12h 30m")
    testSuccess(12.hours + 15.seconds, "12h 0m 15s")
    testSuccess(12.hours + 1.nanoseconds, "12h 0m 0.000000001s")
    testSuccess(30.minutes, "30m")
    testSuccess(17.5.minutes, "17m 30s")

    testSuccess(16.5.minutes, "16m 30s")
    testSuccess(1097.1.seconds, "18m 17.1s")
    testSuccess(90.36.seconds, "1m 30.36s")
    testSuccess(50.seconds, "50s")
    testSuccess(1.3.seconds, "1.3s")
    testSuccess(1.seconds, "1s")

    testSuccess(0.5.seconds, "500ms")
    testSuccess(40.2.milliseconds, "40.2ms")
    testSuccess(4.225.milliseconds, "4.225ms")
    testSuccess(4.24501.milliseconds, "4.245010ms", "4ms 245us 10ns")
    testSuccess(1.milliseconds, "1ms")

    testSuccess(0.75.milliseconds, "750us")
    testSuccess(75.35.microseconds, "75.35us")
    testSuccess(7.25.microseconds, "7.25us")
    testSuccess(1.035.microseconds, "1.035us")
    testSuccess(1.005.microseconds, "1.005us")
    testSuccess(1800.nanoseconds, "1.8us", "1800ns", "0.0000000005h")

    testSuccess(950.5.nanoseconds, "951ns")
    testSuccess(85.23.nanoseconds, "85ns")
    testSuccess(8.235.nanoseconds, "8ns")
    testSuccess(1.nanoseconds, "1ns", "0.9ns", "0.001us", "0.0009us")
    testSuccess(1.3.nanoseconds, "1ns")
    testSuccess(0.75.nanoseconds, "1ns")
    testSuccess(0.7512.nanoseconds, "1ns")

    // equal to zero (cases kept disabled, as in the original)
    // testSuccess(0.023.nanoseconds, "0.023ns")
    // testSuccess(0.0034.nanoseconds, "0.0034ns")
    // testSuccess(0.0000035.nanoseconds, "0.0000035ns")

    testSuccess(Duration.ZERO, "0s", "0.4ns", "0000.0000ns")
    testSuccess(365.days * 10000, "3650000d")
    testSuccess(300.days * 100000, "30000000d")
    testSuccess(365.days * 100000, "36500000d")
    testSuccess(((Long.MAX_VALUE / 2) - 1).milliseconds, "53375995583d 15h 36m 27.902s") // max finite value

    // all infinite (cases kept disabled, as in the original)
    // val universeAge = Duration.days(365.25) * 13.799e9
    // val planckTime = Duration.seconds(5.4e-44)

    // testSuccess(universeAge, "5.04e+12d")
    // testSuccess(planckTime, "5.40e-44s")
    // testSuccess(Duration.nanoseconds(Double.MAX_VALUE), "2.08e+294d")
    testSuccess(Duration.INFINITE, "Infinity", "53375995583d 20h", "+Infinity")
    testSuccess(-Duration.INFINITE, "-Infinity", "-(53375995583d 20h)")

    val referenceComparisonInputs = listOf(
        "",
        " ",
        "P",
        "PT",
        "P1DT",
        "P1",
        "PT1",
        "0",
        "+P",
        "+",
        "-",
        "h",
        "H",
        "something",
        "1234567890123456789012ns",
        "Inf",
        "-Infinity value",
        "1s ",
        " 1s",
        "1d 1m 1h",
        "1s 2s",
        "-12m 15s",
        "-12m -15s",
        "-()",
        "-(12m 30s",
        "+12m 15s",
        "+12m +15s",
        "+()",
        "+(12m 30s",
        "()",
        "(12m 30s)",
        "12.5m 11.5s",
        ".2s",
        "0.1553.39m",
        "P+12+34D",
        "P12-34D",
        "PT1234567890-1234567890S",
        " P1D",
        "PT1S ",
        "P1Y",
        "P1M",
        "P1S",
        "PT1D",
        "PT1Y",
        "PT1S2S",
        "PT1S2H",
        "P9999999999999DT-9999999999999H",
        "PT1.5H",
        "PT0.5D",
        "PT.5S",
        "PT0.25.25S",
    )
    for (input in referenceComparisonInputs) {
        testSameAsReference(input)
    }
}
474+
475+
/**
 * Smoke test: parsing a large frame of arbitrary two-character strings must complete without throwing.
 *
 * The frame is built from 5 000 generated names (`_0`, `_1`, ...) and filled via `fill(100)`
 * (presumably 5 000 columns of 100 rows each; confirm against `dataFrameOf(...).fill`).
 * No assertion is made on the parse result itself.
 */
@Test
fun `Parse normal string column`() {
    // Fixed seed: the generated data is still arbitrary, but identical on every run,
    // so a failure seen once can be reproduced instead of depending on the global Random state.
    val random = Random(42)

    val df = dataFrameOf(List(5_000) { "_$it" }).fill(100) {
        // two arbitrary UTF-16 code units (the Int is truncated to its low 16 bits by toChar())
        random.nextInt().toChar().toString() + random.nextInt().toChar()
    }

    df.parse()
}
483+
484+
/**
 * Asserts that all elements of the iterable are equal to each other.
 *
 * Neighbouring elements are compared pairwise with `shouldBe`, so the first mismatching pair
 * fails the assertion. An empty or single-element iterable trivially passes.
 *
 * @return the receiver, so further assertions can be chained
 */
private fun <T> Iterable<T>.shouldAllBeEqual(): Iterable<T> {
    this should { elements ->
        // zipWithNext yields no pairs for 0 or 1 elements, whereas `reduce` throws
        // UnsupportedOperationException on an empty iterable.
        elements.zipWithNext().forEach { (previous, next) ->
            previous shouldBe next
        }
    }
    return this
}
150496
}

0 commit comments

Comments
 (0)