@@ -2481,14 +2481,13 @@ AK_DR_process_char(AK_DelimitedReader *dr, AK_CodePointGrid *cpg, Py_UCS4 c)
24812481 dr -> state = (c == '\0' ? START_RECORD : EAT_CRNL );
24822482 }
24832483 else if (c == dialect -> quotechar && dialect -> quoting != QUOTE_NONE ) {
2484- // start quoted field
24852484 dr -> state = IN_QUOTED_FIELD ;
24862485 }
2487- else if (c == dialect -> escapechar ) { // possible escaped character
2486+ else if (c == dialect -> escapechar ) {
24882487 dr -> state = ESCAPED_CHAR ;
24892488 }
24902489 else if (c == ' ' && dialect -> skipinitialspace );
2491- else if (c == dialect -> delimiter ) { // save empty field
2490+ else if (c == dialect -> delimiter ) { // end of a field
24922491 if (AK_DR_close_field (dr , cpg )) return -1 ;
24932492 }
24942493 else { // begin new unquoted field
@@ -2515,7 +2514,7 @@ AK_DR_process_char(AK_DelimitedReader *dr, AK_CodePointGrid *cpg, Py_UCS4 c)
25152514 if (AK_DR_close_field (dr , cpg )) return -1 ;
25162515 dr -> state = (c == '\0' ? START_RECORD : EAT_CRNL );
25172516 }
2518- else if (c == dialect -> escapechar ) { // possible escaped character
2517+ else if (c == dialect -> escapechar ) {
25192518 dr -> state = ESCAPED_CHAR ;
25202519 }
25212520 else if (c == dialect -> delimiter ) { // save field - wait for new field
@@ -2566,8 +2565,7 @@ AK_DR_process_char(AK_DelimitedReader *dr, AK_CodePointGrid *cpg, Py_UCS4 c)
25662565 }
25672566 else { // illegal
25682567 PyErr_Format (PyExc_RuntimeError , "'%c' expected after '%c'" ,
2569- dialect -> delimiter ,
2570- dialect -> quotechar );
2568+ dialect -> delimiter , dialect -> quotechar );
25712569 return -1 ;
25722570 }
25732571 break ;
@@ -2943,61 +2941,212 @@ iterable_str_to_array_1d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwarg
29432941 return AK_IterableStrToArray1D (iterable , dtype_specifier , tsep , decc );
29442942}
29452943
2944+ static char * split_after_count_kwarg_names [] = { // keyword names split_after_count passes to PyArg_ParseTupleAndKeywords; entries line up one-to-one, in order, with the units of its "O|$OiOOOOO" format
2945+ "string" , // required ("O" before '|'); the only argument that may be given positionally
2946+ "delimiter" , // optional and keyword-only (after "|$"), as is every entry below
2947+ "count" , // parsed as a C int ("i"); split_after_count rejects values <= 0
2948+ "doublequote" ,
2949+ "escapechar" ,
2950+ "quotechar" ,
2951+ "quoting" ,
2952+ "strict" ,
2953+ NULL // sentinel terminating the name list
2954+ };
29462955
29472956static PyObject *
2948- split_after_count (PyObject * Py_UNUSED (m ), PyObject * args )
2957+ split_after_count (PyObject * Py_UNUSED (m ), PyObject * args , PyObject * kwargs )
29492958{
29502959 PyObject * string = NULL ;
29512960 PyObject * delimiter = NULL ;
29522961 int count = 0 ;
2962+ PyObject * doublequote = NULL ;
2963+ PyObject * escapechar = NULL ;
2964+ PyObject * quotechar = NULL ;
2965+ PyObject * quoting = NULL ;
2966+ PyObject * strict = NULL ;
29532967
2954- if (!PyArg_ParseTuple (args ,
2955- "OOi:split_after_count" ,
2968+ if (!PyArg_ParseTupleAndKeywords (args , kwargs ,
2969+ "O|$OiOOOOO:split_after_count" ,
2970+ split_after_count_kwarg_names ,
29562971 & string ,
2972+ // kwarg-only
29572973 & delimiter ,
2958- & count )) {
2974+ & count ,
2975+ & doublequote ,
2976+ & escapechar ,
2977+ & quotechar ,
2978+ & quoting ,
2979+ & strict
2980+ )) {
29592981 return NULL ;
29602982 }
29612983
29622984 if (!PyUnicode_Check (string )) {
2963- PyErr_Format (PyExc_RuntimeError ,
2985+ PyErr_Format (PyExc_ValueError ,
29642986 "a string is required, not %.200s" ,
29652987 Py_TYPE (string )-> tp_name
29662988 );
29672989 return NULL ;
29682990 }
2969-
29702991 if (count <= 0 ) {
2971- PyErr_Format (PyExc_RuntimeError ,
2992+ PyErr_Format (PyExc_ValueError ,
29722993 "count must be greater than zero, not %i" ,
29732994 count
29742995 );
29752996 return NULL ;
29762997 }
29772998
2978- Py_UCS4 delim_char ;
2999+ AK_Dialect dialect ;
3000+
29793001 if (AK_set_char (
29803002 "delimiter" ,
2981- & delim_char ,
3003+ & dialect . delimiter ,
29823004 delimiter ,
2983- '\0' )) return NULL ;
3005+ ',' )) return NULL ;
3006+
3007+ if (AK_set_bool (
3008+ "doublequote" ,
3009+ & dialect .doublequote ,
3010+ doublequote ,
3011+ true)) return NULL ;
3012+
3013+ if (AK_set_char (
3014+ "escapechar" ,
3015+ & dialect .escapechar ,
3016+ escapechar ,
3017+ 0 )) return NULL ;
3018+
3019+ if (AK_set_char (
3020+ "quotechar" ,
3021+ & dialect .quotechar ,
3022+ quotechar ,
3023+ '"' )) return NULL ;
3024+
3025+ if (AK_set_int (
3026+ "quoting" ,
3027+ & dialect .quoting ,
3028+ quoting ,
3029+ QUOTE_MINIMAL )) return NULL ;
3030+
3031+ if (AK_set_bool (
3032+ "strict" ,
3033+ & dialect .strict ,
3034+ strict ,
3035+ false)) return NULL ;
29843036
29853037 unsigned int kind = PyUnicode_KIND (string );
29863038 const void * data = PyUnicode_DATA (string );
29873039 Py_ssize_t pos = 0 ;
29883040 Py_ssize_t delim_count = 0 ;
29893041 Py_ssize_t linelen = PyUnicode_GET_LENGTH (string );
29903042 Py_UCS4 c ;
3043+ AK_DelimitedReaderState state = START_RECORD ;
29913044
29923045 while (pos < linelen ) {
29933046 c = PyUnicode_READ (kind , data , pos );
2994- if (c == delim_char ) {
2995- delim_count ++ ;
2996- if (delim_count == count ) {
2997- break ; // to not include delim at transition
2998- // do not increment pos so as to exclude in left
3047+
3048+ switch (state ) {
3049+ case START_RECORD : // start of record
3050+ if (c == '\0' ) // empty line
3051+ break ;
3052+ else if (c == '\n' || c == '\r' ) {
3053+ state = EAT_CRNL ;
3054+ break ;
3055+ }
3056+ state = START_FIELD ; // normal character
3057+ // fallthru
3058+ case START_FIELD : // expecting field
3059+ if (c == '\n' || c == '\r' || c == '\0' ) {
3060+ state = (c == '\0' ? START_RECORD : EAT_CRNL );
3061+ }
3062+ else if (c == dialect .quotechar && dialect .quoting != QUOTE_NONE ) {
3063+ state = IN_QUOTED_FIELD ;
3064+ }
3065+ else if (c == dialect .escapechar ) {
3066+ state = ESCAPED_CHAR ;
3067+ }
3068+ else if (c == dialect .delimiter ) { // end of a field
3069+ delim_count += 1 ;
3070+ }
3071+ else {
3072+ state = IN_FIELD ;
3073+ }
3074+ break ;
3075+ case ESCAPED_CHAR :
3076+ if (c == '\n' || c == '\r' ) {
3077+ state = AFTER_ESCAPED_CRNL ;
3078+ break ;
3079+ }
3080+ if (c == '\0' )
3081+ c = '\n' ;
3082+ state = IN_FIELD ;
3083+ break ;
3084+ case AFTER_ESCAPED_CRNL :
3085+ if (c == '\0' ) break ;
3086+ // fallthru
3087+ case IN_FIELD : // in unquoted field
3088+ if (c == '\n' || c == '\r' || c == '\0' ) { // end of line
3089+ state = (c == '\0' ? START_RECORD : EAT_CRNL );
3090+ }
3091+ else if (c == dialect .escapechar ) {
3092+ state = ESCAPED_CHAR ;
3093+ }
3094+ else if (c == dialect .delimiter ) {
3095+ delim_count += 1 ;
3096+ state = START_FIELD ;
3097+ }
3098+ break ;
3099+ case IN_QUOTED_FIELD : // in quoted field
3100+ if (c == '\0' );
3101+ else if (c == dialect .escapechar ) {
3102+ state = ESCAPE_IN_QUOTED_FIELD ;
3103+ }
3104+ else if (c == dialect .quotechar && dialect .quoting != QUOTE_NONE ) {
3105+ state = (dialect .doublequote ? QUOTE_IN_QUOTED_FIELD : IN_FIELD );
29993106 }
3107+ break ;
3108+ case ESCAPE_IN_QUOTED_FIELD :
3109+ if (c == '\0' ) {
3110+ c = '\n' ;
3111+ }
3112+ state = IN_QUOTED_FIELD ;
3113+ break ;
3114+ case QUOTE_IN_QUOTED_FIELD :
3115+ // doublequote - seen a quote in a quoted field
3116+ if (dialect .quoting != QUOTE_NONE && c == dialect .quotechar ) {
3117+ state = IN_QUOTED_FIELD ;
3118+ }
3119+ else if (c == dialect .delimiter ) {
3120+ delim_count += 1 ;
3121+ state = START_FIELD ;
3122+ }
3123+ else if (c == '\n' || c == '\r' || c == '\0' ) {
3124+ state = (c == '\0' ? START_RECORD : EAT_CRNL );
3125+ }
3126+ else if (!dialect .strict ) {
3127+ state = IN_FIELD ;
3128+ }
3129+ else { // illegal
3130+ PyErr_Format (PyExc_RuntimeError , "'%c' expected after '%c'" ,
3131+ dialect .delimiter , dialect .quotechar );
3132+ return NULL ;
3133+ }
3134+ break ;
3135+ case EAT_CRNL :
3136+ if (c == '\n' || c == '\r' );
3137+ else if (c == '\0' )
3138+ state = START_RECORD ;
3139+ else {
3140+ PyErr_Format (PyExc_RuntimeError ,
3141+ "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?" );
3142+ return NULL ;
3143+ }
3144+ break ;
30003145 }
3146+ if (delim_count == count ) {
3147+ break ; // to not include delim at transition
3148+ }
3149+ // NOTE: must break before the increment when finding match
30013150 pos ++ ;
30023151 }
30033152
@@ -3010,7 +3159,7 @@ split_after_count(PyObject *Py_UNUSED(m), PyObject *args)
30103159}
30113160
30123161
3013-
3162+ // A fast counter of unsized iterators
30143163static PyObject *
30153164count_iteration (PyObject * Py_UNUSED (m ), PyObject * iterable )
30163165{
@@ -3883,7 +4032,10 @@ static PyMethodDef arraykit_methods[] = {
38834032 (PyCFunction )iterable_str_to_array_1d ,
38844033 METH_VARARGS | METH_KEYWORDS ,
38854034 NULL },
3886- {"split_after_count" , split_after_count , METH_VARARGS , NULL },
4035+ {"split_after_count" ,
4036+ (PyCFunction )split_after_count ,
4037+ METH_VARARGS | METH_KEYWORDS ,
4038+ NULL },
38874039 {"count_iteration" , count_iteration , METH_O , NULL },
38884040 {"isna_element" , isna_element , METH_O , NULL },
38894041 {"dtype_from_element" , dtype_from_element , METH_O , NULL },
0 commit comments