|  | 
| 1 | 1 | import re | 
| 2 | 2 | import csv | 
|  | 3 | +import logging | 
| 3 | 4 | 
 | 
| 4 |  | -from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError | 
| 5 | 5 | from mfr.extensions.tabular import utilities | 
|  | 6 | +from mfr.extensions.tabular.settings import MAX_FILE_SIZE, TABULAR_INIT_SNIFF_SIZE | 
|  | 7 | +from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError | 
|  | 8 | + | 
|  | 9 | +logger = logging.getLogger(__name__) | 
| 6 | 10 | 
 | 
| 7 | 11 | 
 | 
| 8 | 12 | def csv_stdlib(fp): | 
| 9 | 13 |     """Read and convert a csv file to JSON format using the python standard library | 
| 10 |  | -    :param fp: File pointer object | 
| 11 |  | -    :return: tuple of table headers and data | 
|  | 14 | +
 | 
|  | 15 | +    Quirk: ``csv.Sniffer().sniff()`` needs the FULL first row and ONLY one full row to be able to | 
|  | 16 | +    effectively detect the correct dialect of the file. | 
|  | 17 | +
 | 
|  | 18 | +    :param fp: the file pointer object | 
|  | 19 | +    :return: a tuple of table headers and data | 
| 12 | 20 |     """ | 
| 13 |  | -    data = fp.read(2048) | 
|  | 21 | + | 
|  | 22 | +    logger.info('>>> ??? &&& ~~~ current settings') | 
|  | 23 | +    logger.info('>>> ??? &&& ~~~ max rendering file size = {}'.format(MAX_FILE_SIZE)) | 
|  | 24 | +    logger.info('>>> ??? &&& ~~~ initial sniffing size = {}'.format(TABULAR_INIT_SNIFF_SIZE)) | 
|  | 25 | + | 
|  | 26 | +    # Prepare the first row for sniffing | 
|  | 27 | +    data = fp.read(TABULAR_INIT_SNIFF_SIZE) | 
|  | 28 | +    data = _trim_or_append_data(fp, data, TABULAR_INIT_SNIFF_SIZE, 0) | 
|  | 29 | + | 
|  | 30 | +    # Reset the file pointer | 
| 14 | 31 |     fp.seek(0) | 
| 15 | 32 | 
 | 
|  | 33 | +    # Sniff the first row to find a matching format | 
| 16 | 34 |     try: | 
| 17 | 35 |         dialect = csv.Sniffer().sniff(data) | 
| 18 | 36 |     except csv.Error: | 
| 19 | 37 |         dialect = csv.excel | 
| 20 | 38 |     else: | 
| 21 | 39 |         _set_dialect_quote_attrs(dialect, data) | 
| 22 | 40 | 
 | 
|  | 41 | +    # Explicitly delete data when it is no longer used. | 
| 23 | 42 |     del data | 
|  | 43 | + | 
|  | 44 | +    logger.info('>>> ??? &&& ~~~ dialect delimiter detected = {}'.format(dialect.delimiter)) | 
|  | 45 | +    # Create the CSV reader with the detected dialect | 
| 24 | 46 |     reader = csv.DictReader(fp, dialect=dialect) | 
|  | 47 | + | 
|  | 48 | +    # Update the reader field names to avoid duplicate column names when performing row extraction | 
| 25 | 49 |     columns = [] | 
| 26 |  | -    # update the reader field names to avoid duplicate column names when performing row extraction | 
| 27 | 50 |     for idx, fieldname in enumerate(reader.fieldnames or []): | 
| 28 | 51 |         column_count = sum(1 for column in columns if fieldname == column['name']) | 
| 29 | 52 |         if column_count: | 
| @@ -92,3 +115,62 @@ def _set_dialect_quote_attrs(dialect, data): | 
| 92 | 115 |             dialect.quotechar = '"' | 
| 93 | 116 |         if re.search('"""[[({]\'.+\',', data): | 
| 94 | 117 |             dialect.doublequote = True | 
|  | 118 | + | 
|  | 119 | + | 
def _trim_or_append_data(fp, text, read_size, sniff_size):
    """Read from a file until its first row is complete and return that row.

    The file starts with ``text`` and the file pointer points to the character immediately
    after ``text``. While no line break has been seen, the next chunk is read with twice the
    previous read size and appended to what has been collected so far.

    If the end of the file is reached before any line break is found, the whole file is a
    single row (e.g. a header without a trailing new line) and is returned as is.

    :param fp: the file pointer from which data is read
    :param text: the current text chunk to check for the new line character
    :param read_size: the last read size when ``fp.read()`` was called
    :param sniff_size: the accumulated size of the text to sniff
    :return: the first row of the file as a string, without its line break
    :raises TabularRendererError: if no line break is found before the collected text
        reaches ``MAX_FILE_SIZE``
    """

    # Collect the pieces of the first row and join them once at the end.
    pieces = []
    chunk = text

    while True:
        index = _find_new_line(chunk)
        if index != -1:
            # The row ends inside this chunk: keep only the part before the line break.
            pieces.append(chunk[:index])
            break

        # No line break yet: keep the whole chunk and read twice as much as last time.
        pieces.append(chunk)
        sniff_size += read_size
        read_size *= 2
        more_text = fp.read(read_size)

        # ``read()`` returns an empty string at end of file. Without this check the function
        # would keep doubling ``sniff_size`` on empty reads until it hit the size limit and
        # wrongly report a small single-row file as being too large.
        if not more_text:
            break

        # If the text to sniff now goes over the max file size limit, raise the renderer error
        # since there is no need to sniff when the file is already too large to be rendered.
        if sniff_size + len(more_text) >= MAX_FILE_SIZE:
            raise TabularRendererError(
                'The first row of this file is too large for the sniffer to detect the dialect. '
                'Please download and view it locally.',
                code=400,
                extension='csv'
            )

        chunk = more_text

    return ''.join(pieces)
|  | 159 | + | 
|  | 160 | + | 
def _find_new_line(text):
    """Find the first line break of any style (LF, CR or CRLF) in the text string.

    A CRLF pair is reported at the position of its CR, so ``text[:index]`` never contains any
    part of the line break.

    :param text: the text string to check
    :return: the index of the first new line character if found. Otherwise, return -1.
    """

    # Every line break style starts with either LF or CR, so the first line break is at the
    # smaller of the two first-occurrence positions. Searching from the left (``find``) rather
    # than from the right is what keeps the result limited to the first row.
    positions = [pos for pos in (text.find('\n'), text.find('\r')) if pos != -1]
    return min(positions, default=-1)
0 commit comments