forked from shendric/pyoval
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathappend_files.py
More file actions
124 lines (113 loc) · 5.51 KB
/
append_files.py
File metadata and controls
124 lines (113 loc) · 5.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# -*- coding: utf-8 -*-
# Module version number.
__version__= 1.0
# Module author and affiliation.
__author__= "Justin Beckers, YorkU/UAlberta"
def append_files(outdir, custom="_concat", fileList=None, header=0, skiprows=0,
                 sep=',', dropna=0):
    r'''
    Generic file appending. Read every delimited file in fileList, stack the
    rows in list order, and write the result to a single delimited file.
    Files may or may not have a header. If a header row is read, the column
    names are written as the header of the output file.

    Author: Justin F. Beckers
    Author Contact: beckers@ualberta.ca
    Version: 1.0
    Version Notes: Initial release
    Release Date: March 15,2016

    Inputs:
        outdir:   output directory
        custom:   custom string to add into filename. Default is "_concat"
        fileList: list of files to append. Required: it only carries a
                  default of None because it follows a defaulted parameter
                  in the signature. Must contain at least one file.
        header:   row number of the header row, or None if the files have
                  no header (or it should not be read). Default is 0.
        skiprows: rows to skip before the header
        sep:      file delimiter. Default is comma (','). Use '\s+' for
                  space delimited files (*.dat).
        dropna:   if dropna is set to 1, drop rows that have any nans.

    Outputs:
        Writes a delimited file into outdir. The file name is the part of
        the first input file's name before the first underscore, followed
        by the custom string and '.csv'. The output delimiter equals the
        input delimiter, except that '\s+' is written as a single space.
        The output has a header only if header is not None.

    Raises:
        ValueError: if fileList is None or empty.

    Usage Notes:
        Case 1: Using Main
            From commandline/terminal, can simply run:
                >>>python append_files.py
            This executes the program with the parameters set in the
            "if __name__=='__main__':" section at the bottom of the script.
            Modify that section to meet your requirements.
        Case 2: Import function:
            from append_files import append_files
            append_files(outdir,custom,files,header=0,skiprows=0,dropna=0,
                         sep=',')
    '''
    # Load the required libraries
    import os
    import pandas as pd

    # Fail early with a clear message instead of an IndexError further down.
    if not fileList:
        raise ValueError("fileList must contain at least one file to append")

    # Set the output filename
    prefix = os.path.basename(fileList[0]).split('_')[0]
    outname = os.path.join(outdir, prefix + custom + '.csv')

    # Read every file with identical settings; header=None makes pandas treat
    # the first row as data instead of column names.
    frames = [
        pd.read_table(name, sep=sep, header=header, skiprows=skiprows,
                      index_col=None, na_values=['nan', 'NaN'])
        for name in fileList
    ]

    # Concatenate the data records in the order the files were given.
    out = pd.concat(frames, ignore_index=True, axis=0)

    # If you want to remove rows where any value is a nan value.
    if dropna == 1:
        out = out.dropna()

    # If you need to do anything else to the data, i.e. calculate a new
    # column, sort the data, etc., this is the place to insert that code.

    has_header = header is not None
    # The writer only accepts a one-character delimiter, so the whitespace
    # regex used for reading is written back out as a single space.
    out_sep = ' ' if sep == r'\s+' else sep
    # Headerless output keeps pandas' default empty string for missing
    # values; output with a header writes them as 'nan'.
    na_rep = 'nan' if has_header else ''

    # Write the output to a new file with the same delimiter as the input.
    out.to_csv(path_or_buf=outname, sep=out_sep, na_rep=na_rep,
               header=has_header, index=False, mode='w')
if __name__=='__main__':
    # Script entry point: append every "*_positions.csv" file found in indir
    # into one file in outdir and report roughly how long it took.
    import glob
    import os
    import time  # Used to roughly time the run time of the code.

    # The setup.
    indir = '/Volumes/JustinBeckers_2TBPortable/CryoSat_Lakes/PARLSAR_CS2Data/'
    # glob returns matches in arbitrary order; sorting makes both the output
    # file name (taken from the first file) and the row order repeatable.
    files = sorted(glob.glob(os.path.join(indir, "*_positions.csv")))
    outdir = '/Volumes/JustinBeckers_2TBPortable/CryoSat_Lakes/PARLSAR_CS2Data'
    custom = 'concat'

    # Stop with a readable message when there is nothing to append.
    if not files:
        raise SystemExit("No files matching *_positions.csv found in " + indir)

    # Run.
    # time.clock() no longer exists on current Python 3; wall-clock time is
    # enough for a rough measurement.
    a = time.time()
    append_files(outdir, custom, files, header=0, skiprows=0, dropna=0, sep=',')
    b = time.time()
    # Single-argument call form works as both a Python 2 print statement and
    # a Python 3 print function.
    print("Time to run the script (s): %s" % (b - a))