Skip to content

Commit 11c641f

Browse files
committed
Merge pull request #79 from mgckind/memory_input
Adding reading by chunks of memory
2 parents ac05171 + 6abdaac commit 11c641f

File tree

4 files changed

+385
-39
lines changed

4 files changed

+385
-39
lines changed

easyaccess/easyaccess.py

+51-33
Original file line numberDiff line numberDiff line change
@@ -1757,25 +1757,7 @@ def do_show_index(self, arg):
17571757
def complete_show_index(self, text, line, begidx, lastidx):
17581758
return self._complete_tables(text)
17591759

1760-
def get_filename(self, line):
1761-
# Good to move some of this into eautils.fileio
1762-
line = line.replace(';', '')
1763-
if line == "":
1764-
print('\nMust include table filename!\n')
1765-
return
1766-
if line.find('.') == -1:
1767-
print(colored('\nError in filename\n', "red"))
1768-
return
1769-
1770-
filename = "".join(line.split())
1771-
basename = os.path.basename(filename)
1772-
alls = basename.split('.')
1773-
if len(alls) > 2:
1774-
# Oracle tables cannot contain a '.'
1775-
print("\nDo not use extra '.' in filename\n")
1776-
return
17771760

1778-
return filename
17791761

17801762
def check_table_exists(self, table):
17811763
# check table first
@@ -1915,11 +1897,11 @@ def insert_data(self, table, columns, values, dtypes=None, niter = 0):
19151897
niter+1, len(values), len(columns), table.upper(), t2 - t1), "green"))
19161898

19171899

1918-
def do_load_table(self, line, name=None, chunksize=None):
1900+
def do_load_table(self, line, name=None, chunksize=None, memsize=None):
19191901
"""
19201902
DB:Loads a table from a file (csv or fits) taking name from filename and columns from header
19211903
1922-
Usage: load_table <filename> [--tablename NAME] [--chunksize CHUNK]
1904+
Usage: load_table <filename> [--tablename NAME] [--chunksize CHUNK] [--memsize MEMCHUNK]
19231905
Ex: example.csv has the following content
19241906
RA,DEC,MAG
19251907
1.23,0.13,23
@@ -1932,6 +1914,8 @@ def do_load_table(self, line, name=None, chunksize=None):
19321914
--tablename NAME given name for the table, default is taken from filename
19331915
--chunksize CHUNK Number of rows to be inserted at a time. Useful for large files
19341916
that do not fit in memory
1917+
--memsize MEMCHUNK The size in Mb to be read in chunks. If both specified, the lower
1918+
number of rows is selected (the lower memory limitations)
19351919
19361920
Note: - For csv or tab files, first line must have the column names (without # or any other comment) and same format
19371921
as data (using ',' or space)
@@ -1943,7 +1927,9 @@ def do_load_table(self, line, name=None, chunksize=None):
19431927
load_parser.add_argument('filename', help='name for the file', action='store', default=None)
19441928
load_parser.add_argument('--tablename', help='name for the table', action='store', default=None)
19451929
load_parser.add_argument('--chunksize', help='number of rows to read in blocks to avoid memory '
1946-
'issues', action='store', type=int, default=None)
1930+
'issues', action='store', type=int, default=None)
1931+
load_parser.add_argument('--memsize', help='size of the chunks to be read in Mb ',
1932+
action='store', type=int, default=None)
19471933
load_parser.add_argument('-h', '--help', help='print help', action='store_true')
19481934
try:
19491935
load_args = load_parser.parse_args(line.split())
@@ -1953,11 +1939,20 @@ def do_load_table(self, line, name=None, chunksize=None):
19531939
if load_args.help:
19541940
self.do_help('load_table')
19551941
return
1956-
filename = self.get_filename(load_args.filename)
1942+
filename = eafile.get_filename(load_args.filename)
19571943
table = load_args.tablename
19581944
chunk = load_args.chunksize
1945+
memchunk = load_args.memsize
19591946
if chunksize is not None:
19601947
chunk = chunksize
1948+
if memsize is not None:
1949+
memchunk = memsize
1950+
if memchunk is not None:
1951+
memchunk_rows = eafile.get_chunksize(filename, memory=memchunk)
1952+
if chunk is not None:
1953+
chunk = min(chunk, memchunk_rows)
1954+
else:
1955+
chunk = memchunk_rows
19611956
if filename is None: return
19621957
base, ext = os.path.splitext(os.path.basename(filename))
19631958

@@ -1978,7 +1973,7 @@ def do_load_table(self, line, name=None, chunksize=None):
19781973
return
19791974

19801975
try:
1981-
data, iterator = self.load_data(filename)
1976+
data, iterator = eafile.read_file(filename)
19821977
except:
19831978
print_exception()
19841979
return
@@ -2070,11 +2065,11 @@ def complete_load_table(self, text, line, start_idx, end_idx):
20702065
return _complete_path(line)
20712066

20722067

2073-
def do_append_table(self, line, name=None, chunksize=None):
2068+
def do_append_table(self, line, name=None, chunksize=None, memsize=None):
20742069
"""
20752070
DB:Appends a table from a file (csv or fits) taking name from filename and columns from header.
20762071
2077-
Usage: append_table <filename> [--tablename NAME] [--chunksize CHUNK]
2072+
Usage: append_table <filename> [--tablename NAME] [--chunksize CHUNK] [--memsize MEMCHUNK]
20782073
Ex: example.csv has the following content
20792074
RA,DEC,MAG
20802075
1.23,0.13,23
@@ -2087,7 +2082,9 @@ def do_append_table(self, line, name=None, chunksize=None):
20872082
20882083
--tablename NAME given name for the table, default is taken from filename
20892084
--chunksize CHUNK Number of rows to be inserted at a time. Useful for large files
2090-
that do not fit in memory
2085+
that do not fit in memory
2086+
--memsize MEMCHUNK The size in Mb to be read in chunks. If both specified, the lower
2087+
number of rows is selected (the lower memory limitations)
20912088
20922089
Note: - For csv or tab files, first line must have the column names (without # or any other comment) and same format
20932090
as data (using ',' or space)
@@ -2100,6 +2097,8 @@ def do_append_table(self, line, name=None, chunksize=None):
21002097
append_parser.add_argument('--tablename', help='name for the table to append to', action='store', default=None)
21012098
append_parser.add_argument('--chunksize', help='number of rows to read in blocks to avoid memory '
21022099
'issues', action='store', default=None, type=int)
2100+
append_parser.add_argument('--memsize', help='size of the chunks to be read in Mb ', action='store',
2101+
type=int, default=None)
21032102
append_parser.add_argument('-h', '--help', help='print help', action='store_true')
21042103
try:
21052104
append_args = append_parser.parse_args(line.split())
@@ -2109,11 +2108,21 @@ def do_append_table(self, line, name=None, chunksize=None):
21092108
if append_args.help:
21102109
self.do_help('append_table')
21112110
return
2112-
filename = self.get_filename(append_args.filename)
2111+
filename = eafile.get_filename(append_args.filename)
21132112
table = append_args.tablename
21142113
chunk = append_args.chunksize
2114+
memchunk = append_args.memsize
21152115
if chunksize is not None:
21162116
chunk = chunksize
2117+
if memsize is not None:
2118+
memchunk = memsize
2119+
if memchunk is not None:
2120+
memchunk_rows = eafile.get_chunksize(filename, memory=memchunk)
2121+
if chunk is not None:
2122+
chunk = min(chunk, memchunk_rows)
2123+
else:
2124+
chunk = memchunk_rows
2125+
21172126
if filename is None: return
21182127
base, ext = os.path.splitext(os.path.basename(filename))
21192128

@@ -2134,7 +2143,7 @@ def do_append_table(self, line, name=None, chunksize=None):
21342143
'\n DESDB ~> CREATE TABLE %s (COL1 TYPE1(SIZE), ..., COLN TYPEN(SIZE));\n' % table.upper())
21352144
return
21362145
try:
2137-
data, iterator = self.load_data(filename)
2146+
data, iterator = eafile.read_file(filename)
21382147
except:
21392148
print_exception()
21402149
return
@@ -2508,7 +2517,7 @@ def myquota(self):
25082517
"""
25092518
self.do_myquota('')
25102519

2511-
def load_table(self, table_file, name=None, chunksize=None):
2520+
def load_table(self, table_file, name=None, chunksize=None, memsize=None):
25122521
"""
25132522
Loads and create a table in the DB. If name is not passed, is taken from
25142523
the filename. Formats supported are 'fits', 'csv' and 'tab' files
@@ -2518,21 +2527,22 @@ def load_table(self, table_file, name=None, chunksize=None):
25182527
table_file : Filename to be uploaded as table (.csv, .fits, .tab)
25192528
name : Name of the table to be created
25202529
chunksize : Number of rows to upload at a time to avoid memory issues
2530+
memsize : Size of chunk to be read. In Mb. If both specified, the lower number of rows is selected
25212531
25222532
Returns:
25232533
--------
25242534
True if success otherwise False
25252535
25262536
"""
25272537
try:
2528-
self.do_load_table(table_file, name=name, chunksize=chunksize)
2538+
self.do_load_table(table_file, name=name, chunksize=chunksize, memsize=memsize)
25292539
return True
25302540
except:
25312541
# exception
25322542
return False
25332543

25342544

2535-
def append_table(self, table_file, name=None, chunksize=None):
2545+
def append_table(self, table_file, name=None, chunksize=None, memsize=None):
25362546
"""
25372547
Appends data to a table in the DB. If name is not passed, is taken from
25382548
the filename. Formats supported are 'fits', 'csv' and 'tab' files
@@ -2542,13 +2552,14 @@ def append_table(self, table_file, name=None, chunksize=None):
25422552
table_file : Filename to be uploaded as table (.csv, .fits, .tab)
25432553
name : Name of the table to be created
25442554
chunksize : Number of rows to upload at a time to avoid memory issues
2555+
memsize : Size of chunk to be read. In Mb. If both specified, the lower number of rows is selected
25452556
25462557
Returns:
25472558
--------
25482559
True if success otherwise False
25492560
"""
25502561
try:
2551-
self.do_append_table(table_file, name=name, chunksize=chunksize)
2562+
self.do_append_table(table_file, name=name, chunksize=chunksize, memsize=memsize)
25522563
return True
25532564
except:
25542565
return False
@@ -2664,7 +2675,10 @@ def initial_message(quiet=False, clear=True):
26642675
or --append_table")
26652676
parser.add_argument("--chunksize", dest='chunksize', type=int, default = None,
26662677
help="Number of rows to be inserted at a time. Useful for large files \
2667-
that do not fit in memory. Use with --load_table")
2678+
that do not fit in memory. Use with --load_table or --append_table")
2679+
parser.add_argument("--memsize", dest='memsize', type=int, default = None,
2680+
help=" Size of chunk to be read at a time in Mb. Use with --load_table or "
2681+
"--append_table")
26682682
parser.add_argument("-s", "--db",dest='db', #choices=[...]?
26692683
help="Override database name [dessci,desoper,destest]")
26702684
parser.add_argument("-q", "--quiet", action="store_true", dest='quiet',
@@ -2785,6 +2799,8 @@ def colored(line, color): return line
27852799
linein += ' --tablename ' + args.tablename
27862800
if args.chunksize is not None:
27872801
linein += ' --chunksize ' + str(args.chunksize)
2802+
if args.memsize is not None:
2803+
linein += ' --memsize ' + str(args.memsize)
27882804
cmdinterp.onecmd(linein)
27892805
os._exit(0)
27902806
elif args.appendtable is not None:
@@ -2795,6 +2811,8 @@ def colored(line, color): return line
27952811
linein += ' --tablename ' + args.tablename
27962812
if args.chunksize is not None:
27972813
linein += ' --chunksize ' + str(args.chunksize)
2814+
if args.memsize is not None:
2815+
linein += ' --memsize ' + str(args.memsize)
27982816
cmdinterp.onecmd(linein)
27992817
os._exit(0)
28002818
else:

easyaccess/eautils/fileio.py

+62
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@
2121
import eautils.dtypes as eatypes
2222
import version
2323

24+
try:
25+
from termcolor import colored
26+
except:
27+
def colored(line, color): return line
28+
2429
PANDAS_DEFS = ('comma separated text', 'space separated tex', 'HDF5 format')
2530
PANDAS_EXTS = ('.csv', '.tab', '.h5')
2631

@@ -31,6 +36,63 @@
3136
FILE_EXTS = PANDAS_EXTS + FITS_EXTS
3237

3338

39+
def get_filename(line):
40+
"""
41+
Return filename after checking it has the right structure (no extra periods)
42+
"""
43+
line = line.replace(';', '')
44+
if line == "":
45+
print('\nMust include table filename!\n')
46+
return
47+
if line.find('.') == -1:
48+
print(colored('\nError in filename\n', "red"))
49+
return
50+
51+
filename = "".join(line.split())
52+
basename = os.path.basename(filename)
53+
alls = basename.split('.')
54+
if len(alls) > 2:
55+
# Oracle tables cannot contain a '.'
56+
print("\nDo not use extra '.' in filename\n")
57+
return
58+
59+
return filename
60+
61+
62+
def get_chunksize(filename, memory=500):
63+
"""
64+
Get the approximate number of lines to be read given memory constraints
65+
66+
Parameters:
67+
-----------
68+
filename : File name
69+
memory : Memory in MB to compute the approximate number of rows
70+
71+
Returns:
72+
--------
73+
The number of rows that need to be read for each chunk of memory
74+
"""
75+
base, ext = os.path.splitext(filename)
76+
check_filetype(ext, FILE_EXTS)
77+
78+
if ext in PANDAS_EXTS:
79+
if ext == '.csv': sepa = ','
80+
elif ext == '.tab' : sepa = None
81+
elif ext == '.h5':
82+
return IOError('\nReading HDF5 files by chunks is not supported yet\n')
83+
temp = pd.read_csv(filename, sep=sepa, nrows=100)
84+
bytes_per_row = temp.memory_usage(index=True).sum()/100.
85+
del temp
86+
elif ext in FITS_EXTS:
87+
temp = fitsio.FITS(filename)
88+
temp_data = temp[1][0:100]
89+
bytes_per_row = temp_data.nbytes/100.
90+
temp.close()
91+
del temp_data
92+
93+
return int(memory*1024**2/bytes_per_row)
94+
95+
3496
def cutquery(query, length):
3597
"""
3698
Return query in a list of fixed sized character strings

0 commit comments

Comments
 (0)