-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathimportRdf.php
93 lines (76 loc) · 3.15 KB
/
importRdf.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
<?php
/**
* To the extent possible under law, I, Samuel Lampa, have waived all copyright and
* related or neighboring rights to Hello World. This work is published from Sweden.
*
* @copyright CC0 http://creativecommons.org/publicdomain/zero/1.0/
* @author Samuel Lampa <[email protected]>
* @ingroup Maintenance
*/
// Locate the MediaWiki installation: prefer the MW_INSTALL_PATH environment
// variable, otherwise fall back to the directory three levels above this script.
$basePath = getenv( 'MW_INSTALL_PATH' );
if ( $basePath === false ) {
	$basePath = __DIR__ . '/../../..';
}
require_once $basePath . '/maintenance/Maintenance.php';
/**
 * Maintenance script that reads RDF triples from an N-Triples file (one triple
 * per line) and passes them to RDFIORDFImporter, either all at once or in
 * chunks with an optional pause between chunks.
 */
class BatchImportRDF extends Maintenance {

	/**
	 * Register the command line options accepted by this script.
	 */
	public function __construct() {
		parent::__construct();
		// N-Triples is required so the input can be split into chunks line by line.
		// Splitting RDF/XML or Turtle is much harder.
		$this->addOption( 'in', 'A file in with RDF data in NTriples format, with one triple per line.', true, true );
		$this->addOption( 'chunksize', 'How many lines (triples) to import at a time. 0 means no chunking.', false, true );
		$this->addOption( 'chunksleep', 'How many seconds (float value) to sleep after each chunk has been imported.', false, true );
		$this->addOption( 'offset', 'Skip this many triples before starting import', false, true );
		$this->addOption( 'verbose', 'Show verbose output', false, false, 'v' );
	}

	/**
	 * Run the import.
	 *
	 * Lines before the configured offset are skipped. The remaining lines are
	 * accumulated and imported whenever a full chunk has been collected; whatever
	 * is left after the last full chunk (or everything, when chunksize is 0) is
	 * imported at the end.
	 *
	 * @return bool|void False when the input file cannot be opened.
	 *  NOTE(review): newer MediaWiki cores turn a false return value into a
	 *  non-zero exit status; older ones ignore it - confirm for the targeted version.
	 */
	public function execute() {
		$inFile = $this->getOption( 'in', '' );
		$chunksize = intval( $this->getOption( 'chunksize', 0 ) );
		$chunksleep = floatval( $this->getOption( 'chunksleep', 0.0 ) );
		$offset = intval( $this->getOption( 'offset', 0 ) );
		$verbose = $this->getOption( 'verbose', false );

		$this->output( "Starting import from file: $inFile\n" );
		if ( $offset > 0 ) {
			$this->output( "Starting with offset $offset ...\n" );
		}

		// Fail early and cleanly instead of calling fgets() on an invalid handle.
		$inFileHandle = fopen( $inFile, 'r' );
		if ( $inFileHandle === false ) {
			$this->error( "Could not open input file for reading: $inFile" );
			return false;
		}

		$rdfImporter = new RDFIORDFImporter();

		$lineinchunk = 1;
		$chunkindex = 1;
		$lineindex = 0;
		$totalimported = 0;
		$importdata = '';

		try {
			// Compare against false explicitly: a last line containing only "0"
			// is a falsy string and would otherwise end the loop prematurely.
			while ( ( $line = fgets( $inFileHandle ) ) !== false ) {
				// Skip the first $offset lines.
				if ( $lineindex++ < $offset ) {
					continue;
				}

				if ( $verbose && $chunksize > 0 && $lineinchunk == 1 ) {
					$this->output( "Starting chunk $chunkindex ...\n" );
				}

				$importdata .= $line;
				if ( $verbose ) {
					$this->output( "Appended line $lineinchunk in chunk $chunkindex, to indata ...\n" );
				}
				$totalimported++;

				if ( $chunksize != 0 && $lineinchunk == $chunksize ) {
					$rdfImporter->importTurtle( $importdata );
					$totalwithoffset = $totalimported + $offset;
					$this->output( "Imported $chunksize triples in chunk $chunkindex ($totalimported triples imported in total, and $totalwithoffset including offset)!\n" );

					// Start a fresh chunk; $lineinchunk is bumped back to 1 below.
					$lineinchunk = 0;
					$importdata = '';
					$chunkindex++;

					if ( $chunksleep > 0 ) {
						if ( $verbose ) {
							$this->output( 'Now sleeping for ' . strval( $chunksleep ) . " seconds before continuing with next chunk ...\n" );
						}
						$this->sleepBetweenChunks( $chunksleep );
					}
				}
				$lineinchunk++;
			}

			// Import any remaining lines, or everything if chunksize = 0.
			// Nothing is left when the line count was an exact multiple of the
			// chunk size, so avoid handing an empty string to the importer.
			if ( $importdata !== '' ) {
				$rdfImporter->importTurtle( $importdata );
			}
		} finally {
			// Release the handle even if the importer throws.
			fclose( $inFileHandle );
		}

		$this->output( "Finished importing everything ($totalimported triples in total)!\n" );
	}

	/**
	 * Pause for a possibly fractional number of seconds.
	 *
	 * sleep() only accepts whole seconds, so the fractional part is handled
	 * with usleep(); the whole part still goes through sleep() because usleep()
	 * is not guaranteed to support delays above one second on every platform.
	 *
	 * @param float $seconds Positive number of seconds to pause.
	 */
	private function sleepBetweenChunks( $seconds ) {
		$wholeSeconds = (int)floor( $seconds );
		$microseconds = (int)round( ( $seconds - $wholeSeconds ) * 1000000 );
		if ( $wholeSeconds > 0 ) {
			sleep( $wholeSeconds );
		}
		if ( $microseconds > 0 ) {
			usleep( $microseconds );
		}
	}
}
// Name the maintenance class for the MediaWiki runner, then include the
// runner file referenced by the RUN_MAINTENANCE_IF_MAIN constant.
$maintClass = BatchImportRDF::class;
require_once RUN_MAINTENANCE_IF_MAIN;