1
1
"""
2
- Provides support for ARC v1 files.
2
+ Provides support for ARC v1 files.
3
3
4
4
:copyright: (c) 2012 Internet Archive
5
5
"""
@@ -28,21 +28,21 @@ class ARCHeader(CaseInsensitiveDict):
28
28
* content_type
29
29
* length (length of the n/w doc in bytes)
30
30
31
- V2 header fields are
31
+ V2 header fields are
32
32
33
33
* url
34
34
* ip_address
35
35
* date (date of archival)
36
- * content_type
36
+ * content_type
37
37
* result_code (response code)
38
- * checksum
38
+ * checksum
39
39
* location
40
40
* offset (offset from beginning of file to record)
41
41
* filename (name of arc file)
42
42
* length (length of the n/w doc in bytes)
43
43
44
44
"""
45
- def __init__ (self , url = "" , ip_address = "" , date = "" , content_type = "" ,
45
+ def __init__ (self , url = "" , ip_address = "" , date = "" , content_type = "" ,
46
46
result_code = "" , checksum = "" , location = "" , offset = "" , filename = "" , length = "" , version = 2 ):
47
47
48
48
if isinstance (date , datetime .datetime ):
@@ -55,8 +55,8 @@ def __init__(self, url = "", ip_address = "", date = "", content_type = "",
55
55
56
56
self .version = version
57
57
58
- CaseInsensitiveDict .__init__ (self ,
59
- url = url ,
58
+ CaseInsensitiveDict .__init__ (self ,
59
+ url = url ,
60
60
ip_address = ip_address ,
61
61
date = date ,
62
62
content_type = content_type ,
@@ -66,10 +66,10 @@ def __init__(self, url = "", ip_address = "", date = "", content_type = "",
66
66
offset = offset ,
67
67
filename = filename ,
68
68
length = length )
69
-
69
+
70
70
def write_to (self , f , version = None ):
71
71
"""
72
- Writes out the arc header to the file like object `f`.
72
+ Writes out the arc header to the file like object `f`.
73
73
74
74
If the version field is 1, it writes out an arc v1 header,
75
75
otherwise (and this is default), it outputs a v2 header.
@@ -93,44 +93,44 @@ def write_to(self, f, version = None):
93
93
filename = self ['filename' ],
94
94
length = self ['length' ])
95
95
f .write (header )
96
-
96
+
97
97
98
98
@property
99
99
def url (self ):
100
100
return self ["url" ]
101
-
101
+
102
102
@property
103
103
def ip_address (self ):
104
104
return self ["ip_address" ]
105
-
105
+
106
106
@property
107
107
def date (self ):
108
108
return datetime .datetime .strptime (self ['date' ], "%Y%m%d%H%M%S" )
109
-
109
+
110
110
@property
111
111
def content_type (self ):
112
112
return self ["content_type" ]
113
-
113
+
114
114
@property
115
115
def result_code (self ):
116
116
return self ["result_code" ]
117
-
117
+
118
118
@property
119
119
def checksum (self ):
120
120
return self ["checksum" ]
121
-
121
+
122
122
@property
123
123
def location (self ):
124
124
return self ["location" ]
125
-
125
+
126
126
@property
127
127
def offset (self ):
128
128
return int (self ["offset" ])
129
-
129
+
130
130
@property
131
131
def filename (self ):
132
132
return self ["filename" ]
133
-
133
+
134
134
@property
135
135
def length (self ):
136
136
return int (self ["length" ])
@@ -139,7 +139,7 @@ def __str__(self):
139
139
f = io .StringIO ()
140
140
self .write_to (f )
141
141
return f .getvalue ()
142
-
142
+
143
143
def __repr__ (self ):
144
144
f = {}
145
145
for i in "url ip_address date content_typeresult_code checksum location offset filename length" .split ():
@@ -149,19 +149,19 @@ def __repr__(self):
149
149
s = ", " .join (s )
150
150
return "<ARCHeader(%s)>" % s
151
151
152
-
152
+
153
153
class ARCRecord (object ):
154
154
def __init__ (self , header = None , payload = None , headers = {}, version = None ):
155
155
if not (header or headers ):
156
156
raise TypeError ("Can't write create an ARC1 record without a header" )
157
157
self .header = header or ARCHeader (version = version , ** headers )
158
158
self .payload = payload
159
159
self .version = version
160
-
160
+
161
161
@classmethod
162
162
def from_string (cls , string , version ):
163
163
"""
164
- Constructs an ARC record from a string and returns it.
164
+ Constructs an ARC record from a string and returns it.
165
165
166
166
TODO: It might be best to merge this with the _read_arc_record
167
167
function rather than reimplement the functionality here.
@@ -199,20 +199,20 @@ def __getitem__(self, name):
199
199
def __setitem__ (self , name , value ):
200
200
self .header [name ] = value
201
201
202
-
202
+
203
203
def __str__ (self ):
204
204
f = io .StringIO ()
205
205
self .write_to (f )
206
206
return f .getvalue ()
207
-
208
-
207
+
208
+
209
209
class ARCFile (object ):
210
210
def __init__ (self , filename = None , mode = None , fileobj = None , version = None , file_headers = {}, compress = False ):
211
211
"""
212
212
Initialises a file like object that can be used to read or
213
213
write Arc files. Works for both version 1 or version 2.
214
214
215
- This can be called similar to the builtin `file` constructor.
215
+ This can be called similar to the builtin `file` constructor.
216
216
217
217
It can also just be given a fileobj which is a file like
218
218
object that it will use directly for its work.
@@ -224,7 +224,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_
224
224
225
225
* ip_address - IP address of the machine doing the Archiving
226
226
* date - Date of archival
227
- * org - Organisation that's doing the Archiving.
227
+ * org - Organisation that's doing the Archiving.
228
228
229
229
The version parameter tries to work intuitively as follows
230
230
@@ -248,7 +248,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_
248
248
* When we try to read a record, it will read out one
249
249
record and try to guess the version from it (for
250
250
the first read).
251
-
251
+
252
252
"""
253
253
if fileobj is None :
254
254
fileobj = builtins .open (filename , mode or "rb" )
@@ -259,7 +259,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_
259
259
260
260
if compress :
261
261
fileobj = gzip .open (fileobj , mode )
262
-
262
+
263
263
self .fileobj = fileobj
264
264
265
265
self .filename = filename
@@ -282,7 +282,7 @@ def __enter__(self):
282
282
283
283
def __exit__ (self , exc_type , exc_value , traceback ):
284
284
self .close ()
285
-
285
+
286
286
def _write_header (self ):
287
287
"Writes out an ARC header"
288
288
if "org" not in self .file_headers :
@@ -301,15 +301,15 @@ def _write_header(self):
301
301
payload = "2 0 %(org)s\n URL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length"
302
302
else :
303
303
raise IOError ("Can't write an ARC file with version '\" %s\" '" % self .version )
304
-
304
+
305
305
fname = os .path .basename (self .filename )
306
306
header = ARCHeader (url = "filedesc://%s" % fname ,
307
- ip_address = self .file_headers ['ip_address' ],
307
+ ip_address = self .file_headers ['ip_address' ],
308
308
date = self .file_headers ['date' ],
309
- content_type = "text/plain" ,
309
+ content_type = "text/plain" ,
310
310
length = len (payload ),
311
311
result_code = "200" ,
312
- checksum = "-" ,
312
+ checksum = "-" ,
313
313
location = "-" ,
314
314
offset = str (self .fileobj .tell ()),
315
315
filename = fname )
@@ -340,7 +340,7 @@ def _read_file_header(self):
340
340
# print "--------------------------------------------------"
341
341
if self .version and int (self .version ) != version :
342
342
raise IOError ("Version mismatch. Requested version was '%s' but version in file was '%s'" % (self .version , version ))
343
-
343
+
344
344
if version == '1' :
345
345
url , ip_address , date , content_type , length = header .split ()
346
346
self .file_headers = {"ip_address" : ip_address ,
@@ -404,13 +404,13 @@ def _read_arc_record(self):
404
404
self .fileobj .readline () # Munge the separator newline.
405
405
406
406
return ARCRecord (header = arc_header , payload = payload )
407
-
407
+
408
408
def read (self ):
409
409
"Reads out an arc record from the file"
410
410
if not self .header_read :
411
411
self ._read_file_header ()
412
412
return self ._read_arc_record ()
413
-
413
+
414
414
# For compatibility with WARCFile
415
415
read_record = read
416
416
write_record = write
@@ -420,16 +420,16 @@ def __iter__(self):
420
420
while record :
421
421
yield record
422
422
record = self .read ()
423
-
423
+
424
424
def close (self ):
425
425
self .fileobj .close ()
426
-
427
-
428
-
429
-
430
-
431
-
432
-
433
-
434
-
435
-
426
+
427
+
428
+
429
+
430
+
431
+
432
+
433
+
434
+
435
+
0 commit comments