|
| 1 | +#!/usr/sh |
| 2 | +#=============================================================================== |
| 3 | +# |
| 4 | +# FILE: command-line-tools-for-encoding.sh |
| 5 | +# |
| 6 | +# USAGE: cat command-line-tools-for-encoding.sh |
| 7 | +# |
| 8 | +# DESCRIPTION: EticaAI/HXL-Data-Science-file-formats/guides/command-line-tools-for-encoding.sh |
| 9 | +# is a quick overview of different command line tools that are |
| 10 | +# worth at least a mention, especially if you are dealing with raw |
| 11 | +# formats that are not yet HXLated AND do not have UTF-8 encoding. |
| 12 | +# |
| 13 | +# |
| 14 | +# OPTIONS: --- |
| 15 | +# |
| 16 | +# REQUIREMENTS: 1. python3 |
| 17 | +# 2. pip |
| 18 | +# |
| 19 | +# BUGS: --- |
| 20 | +# NOTES: This guide is tested on Ubuntu 20.04. |
| 21 | +# - Most tools here are available on Linux/Mac/Windows+WSL, |
| 22 | +# but you may need to change package names when installing. |
| 23 | +# - Consider reading the source documentation for how to install |
| 24 | +# on other systems. |
| 25 | +# AUTHOR: Emerson Rocha <rocha[at]ieee.org> |
| 26 | +# COMPANY: EticaAI |
| 27 | +# LICENSE: Public Domain dedication |
| 28 | +# SPDX-License-Identifier: Unlicense |
| 29 | +# VERSION: v1.0 |
| 30 | +# CREATED: 2021-02-02 17:01 UTC |
| 31 | +# REVISION: --- |
| 32 | +#=============================================================================== |
| 33 | +echo "cat command-line-tools-for-encoding.sh" |
| 34 | +exit 1 |
| 35 | + |
| 36 | + |
| 37 | +#### Encoding __________________________________________________________________ |
| 38 | + |
| 39 | +### uchardet ------------------------------------------------------------------- |
| 40 | +# " uchardet is an encoding detector library, which takes a sequence of bytes in |
| 41 | +# an unknown character encoding without any additional information, and attempts |
| 42 | +# to determine the encoding of the text. Returned encoding names are |
| 43 | +# iconv-compatible." |
| 44 | +# @see https://www.freedesktop.org/wiki/Software/uchardet/ |
| 45 | +sudo apt install uchardet |
| 46 | + |
| 47 | +# fititnt@bravo:/workspace/data/brasil_inep_microdados-enem-2019/DADOS$ uchardet MICRODADOS_ENEM_2019.csv |
| 48 | +# ISO-8859-1 |
| 49 | +# fititnt@bravo:/workspace/data/brasil_inep_microdados-enem-2019/DADOS$ file MICRODADOS_ENEM_2019.csv |
| 50 | +# MICRODADOS_ENEM_2019.csv: ISO-8859 text, with very long lines |
| 51 | + |
| 52 | +### iconv (convert encoding) --------------------------------------------------- |
| 53 | +# @see https://stackoverflow.com/questions/64860/best-way-to-convert-text-files-between-character-sets/64889#64889 |
| 54 | +# 'uchardet' can detect the encoding more precisely than 'file'. The example |
| 55 | +# below uses ISO-8859-1: |
| 56 | + |
| 57 | +iconv --from-code=ISO-8859-1 --to-code UTF-8 file.csv > file_utf8.csv |
| 58 | +# iconv --from-code=ISO-8859-1 --to-code UTF-8 MICRODADOS_ENEM_2019.csv > MICRODADOS_ENEM_2019_utf8.csv |
| 59 | +# csvformat --out-delimiter=, MICRODADOS_ENEM_2019_utf8.csv > MICRODADOS_ENEM_2019_utf8__csvformat.csv |
0 commit comments