diff --git a/htroot/ConfigParser_p.html b/htroot/ConfigParser_p.html index a51ee1013..66a4665d3 100644 --- a/htroot/ConfigParser_p.html +++ b/htroot/ConfigParser_p.html @@ -51,27 +51,6 @@

Parser Configuration

-
PDF Parser Attributes -

- This is an experimental setting which makes it possible to split PDF documents into individual index entries. - Every page will become a single index hit and the url is artifically extended with a post/get attribute value containing - the page number as value. When such an url is displayed within a search result, then the post/get attribute is transformed into an anchor hash link. - This makes it possible to view the individual page directly in the pdf.js viewer built-in into firefox, - for reference see https://github.com/mozilla/pdf.js/wiki/Viewer-options -

- - - - - - - - - - - - -
Split PDF
Property Name
#%env/templates/footer.template%# diff --git a/htroot/Crawler_p.html b/htroot/Crawler_p.html index 79a0319c0..3b328a996 100644 --- a/htroot/Crawler_p.html +++ b/htroot/Crawler_p.html @@ -134,7 +134,7 @@

Crawler

Speed / PPM
(Pages Per Minute) - + @@ -147,7 +147,7 @@

Crawler

Crawler PPM     - + diff --git a/ivy.xml b/ivy.xml index 61f9ee127..8c072699d 100644 --- a/ivy.xml +++ b/ivy.xml @@ -28,6 +28,7 @@ + diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index a3404bec0..1cac0dace 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -2593,14 +2593,18 @@ public boolean exists(final ClientIdentification.Agent agent) { return client.fileSize(path) > 0; } if (isHTTP() || isHTTPS()) { - try (final HTTPClient client = new HTTPClient(agent)) { - client.setHost(getHost()); - org.apache.http.HttpResponse response = client.HEADResponse(this, true); - return response != null && (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 301); - } + final HTTPClient client = new HTTPClient(agent); + client.setHost(getHost()); + org.apache.http.HttpResponse response = client.HEADResponse(this, true); + client.close(); + if (response == null) return false; + int status = response.getStatusLine().getStatusCode(); + return status == 200 || status == 301 || status == 302; } return false; } catch (IOException e) { + if (e.getMessage().contains("Circular redirect to")) return true; // exception; this is a 302 which the client actually accepts + //e.printStackTrace(); return false; } } diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java index 26f36f787..118e27e40 100644 --- a/source/net/yacy/document/importer/ZimImporter.java +++ b/source/net/yacy/document/importer/ZimImporter.java @@ -25,12 +25,20 @@ import java.io.File; import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; import java.util.Collection; +import java.util.Date; +import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.Map; +import java.util.Set; import java.util.TreeMap; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; +import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.ConcurrentLog; @@ -81,14 +89,18 @@ public ZimImporter(String path) throws IOException { public void run() { job = this; this.startTime = System.currentTimeMillis(); + Switchboard sb = Switchboard.getSwitchboard(); try { this.reader = new ZIMReader(this.file); this.guessedSource = getSource(this.reader); // verify the source DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry(); - DigestURL url = new DigestURL(mainEntry.url); - if (!url.exists(ClientIdentification.browserAgent)) return; + DigestURL mainURL = guessURL(this.guessedSource, mainEntry); + if (!mainURL.exists(ClientIdentification.browserAgent)) { + sb.log.info("zim importer: file " + this.file.getName() + " failed main url existence test: " + mainURL); + return; + } // read all documents for (int i = 0; i < this.file.header_entryCount; i++) { @@ -98,8 +110,14 @@ public void run() { ArticleEntry ae = (ArticleEntry) de; // check url - String guessedUrl = guessURL(this.guessedSource, de); - assert guessedUrl.startsWith("http"); + DigestURL guessedUrl = guessURL(this.guessedSource, de); + if (recordCnt < 10) { + // critical test for the first 10 urls + if (!guessedUrl.exists(ClientIdentification.browserAgent)) { + sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl); + return; + } + } // check availability of text parser String mimeType = ae.getMimeType(); @@ -111,7 +129,17 @@ public void run() { // create artificial request and response headers for the indexer RequestHeader requestHeader = new RequestHeader(); ResponseHeader responseHeader = new ResponseHeader(200); - final Request request = new Request(new DigestURL(guessedUrl), null); + responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content + final Request request = new Request( + ASCII.getBytes(sb.peers.mySeed().hash), + guessedUrl, + null, // referrerhash the hash of the referrer URL + de.title, // name the name of the document to crawl + null, // appdate the time when the url was first time appeared + sb.crawler.defaultSurrogateProfile.handle(), // profileHandle the name of the prefetch profile. This must not be null! + 0, // depth the crawling depth of the entry + sb.crawler.defaultSurrogateProfile.timezoneOffset() // timezone offset + ); final Response response = new Response( request, requestHeader, @@ -122,7 +150,7 @@ public void run() { ); // throw this to the indexer - String error = Switchboard.getSwitchboard().toIndexer(response); + String error = sb.toIndexer(response); if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error); this.recordCnt++; } @@ -203,7 +231,7 @@ public static String guessDomainName(String fileName) { case "fonts": return "fonts.google.com"; case "gutenberg": - return "gutenberg.org"; + return "https://dev.library.kiwix.org/viewer#gutenberg_de_all_2023-03"; case "ifixit": return "ifixit.com"; case "lesfondamentaux": @@ -223,11 +251,23 @@ public static String guessDomainName(String fileName) { case "rapsberry_pi_docs": return "raspberrypi.org"; case "ted": - return "ted.com"; + return "www.ted.com/search?q="; case "vikidia": - return "vikidia.org"; + return parts[1] + ".vikidia.org/wiki"; case "westeros": return "westeros.org"; + case "wikihow": + return parts[1].equals("en") ? "wikihow.com" : parts[1] + ".wikihow.com"; + case "wikisource": + return parts[1] + ".wikisource.org/wiki"; + case "wikiversity": + return parts[1] + ".wikiversity.org/wiki"; + case "wikivoyage": + return parts[1] + ".wikivoyage.org/wiki"; + case "wiktionary": + return parts[1] + ".wiktionary.org/wiki"; + case "wikiquote": + return parts[1] + ".wikiquote.org/wiki"; case "wikibooks": return parts[1] + ".wikibooks.org/wiki"; case "wikinews": @@ -273,16 +313,148 @@ public static String getSource(ZIMReader r) throws IOException { return source; } - public static String guessURL(String guessedSource, DirectoryEntry de) { + public static DigestURL guessURL(String guessedSource, DirectoryEntry de) throws MalformedURLException { String url = de.url; if (url.equals("Main_Page")) url = ""; - if (guessedSource != null) return guessedSource + url; - if (url.startsWith("A/")) return "https://" + url.substring(2); - if (url.startsWith("H/")) return "https://" + url.substring(2); - return guessedSource + url; + if (guessedSource != null) return new DigestURL(guessedSource + url); + if (url.startsWith("A/")) return new DigestURL("https://" + url.substring(2)); + if (url.startsWith("H/")) return new DigestURL("https://" + url.substring(2)); + return new DigestURL(guessedSource + url); } + private final static String[] skip_files = { + "iota.stackexchange.com_en_all_2023-05.zim", + "stellar.stackexchange.com_en_all_2023-10.zim", + "vegetarianism.stackexchange.com_en_all_2023-05.zim", + "esperanto.stackexchange.com_eo_all_2023-10.zim", + "tezos.stackexchange.com_en_all_2023-10.zim", + "eosio.stackexchange.com_en_all_2023-10.zim", + "ebooks.stackexchange.com_en_all_2023-10.zim", + "poker.stackexchange.com_en_all_2023-05.zim", + "cseducators.stackexchange.com_en_all_2023-10.zim", + "iot.stackexchange.com_en_all_2023-05.zim", + "portuguese.stackexchange.com_pt_all_2023-04.zim", + "portuguese.stackexchange.com_pt_all_2023-10.zim", + "italian.stackexchange.com_it_all_2023-05.zim", + "monero.stackexchange.com_en_all_2022-11.zim", + "sustainability.stackexchange.com_en_all_2023-05.zim", + "westeros_en_all_nopic_2021-03.zim", + "opensource.stackexchange.com_en_all_2023-10.zim", + "tor.stackexchange.com_en_all_2023-05.zim", + "devops.stackexchange.com_en_all_2023-10.zim", + "patents.stackexchange.com_en_all_2023-10.zim", + "stackapps.com_en_all_2023-05.zim", + "hardwarerecs.stackexchange.com_en_all_2023-05.zim", + "hsm.stackexchange.com_en_all_2023-05.zim", + "expatriates.stackexchange.com_en_all_2023-11.zim", + "opendata.stackexchange.com_en_all_2023-10.zim", + "sports.stackexchange.com_en_all_2023-05.zim", + "wikinews_de_all_nopic_2023-10.zim", + "computergraphics.stackexchange.com_en_all_2023-10.zim", + "tridion.stackexchange.com_en_all_2023-10.zim", + "bioinformatics.stackexchange.com_en_all_2023-10.zim", + "expressionengine.stackexchange.com_en_all_2023-11.zim", + "elementaryos.stackexchange.com_en_all_2023-10.zim", + "cstheory.stackexchange.com_en_all_2023-10.zim", + "chess.stackexchange.com_en_all_2023-05.zim", + "vi.stackexchange.com_en_all_2023-05.zim", + "fitness.stackexchange.com_en_all_2023-10.zim", + "pets.stackexchange.com_en_all_2023-05.zim", + "french.stackexchange.com_fr_all_2023-10.zim", + "sqa.stackexchange.com_en_all_2023-05.zim", + "islam.stackexchange.com_en_all_2023-05.zim", + "scicomp.stackexchange.com_en_all_2023-05.zim", + "wikinews_en_all_nopic_2023-09.zim", + "ai.stackexchange.com_en_all_2023-10.zim", + "boardgames.stackexchange.com_en_all_2023-05.zim", + "economics.stackexchange.com_en_all_2023-05.zim", + "3dprinting.stackexchange.com_en_all_2023-07.zim", + "earthscience.stackexchange.com_en_all_2023-05.zim", + "emacs.stackexchange.com_en_all_2023-10.zim", + "bitcoin.stackexchange.com_en_all_2023-05.zim", + "philosophy.stackexchange.com_en_all_2023-05.zim", + "law.stackexchange.com_en_all_2023-05.zim", + "astronomy.stackexchange.com_en_all_2023-05.zim", + "artofproblemsolving_en_all_nopic_2021-03.zim", + "engineering.stackexchange.com_en_all_2023-05.zim", + "ja.stackoverflow.com_ja_all_2023-06.zim", + "webmasters.stackexchange.com_en_all_2023-05.zim", + "anime.stackexchange.com_en_all_2023-10.zim", + "cooking.stackexchange.com_en_all_2023-05.zim", + "arduino.stackexchange.com_en_all_2023-05.zim", + "money.stackexchange.com_en_all_2023-05.zim", + "judaism.stackexchange.com_en_all_2023-05.zim", + "ethereum.stackexchange.com_en_all_2023-05.zim", + "datascience.stackexchange.com_en_all_2023-10.zim", + "academia.stackexchange.com_en_all_2023-10.zim", + "music.stackexchange.com_en_all_2023-05.zim", + "cs.stackexchange.com_en_all_2023-03.zim", + "dsp.stackexchange.com_en_all_2023-05.zim", + "biology.stackexchange.com_en_all_2023-05.zim", + "android.stackexchange.com_en_all_2023-10.zim", + "bicycles.stackexchange.com_en_all_2023-05.zim", + "puzzling.stackexchange.com_en_all_2023-05.zim", + "photo.stackexchange.com_en_all_2023-05.zim", + "aviation.stackexchange.com_en_all_2023-05.zim", + "drupal.stackexchange.com_en_all_2023-05.zim", + "ux.stackexchange.com_en_all_2023-05.zim", + "ell.stackexchange.com_en_all_2023-10.zim", + "openstreetmap-wiki_en_all_nopic_2023-05.zim", + "softwareengineering.stackexchange.com_en_all_2023-05.zim", + "gaming.stackexchange.com_en_all_2023-10.zim", + "mathematica.stackexchange.com_en_all_2023-10.zim", + "pt.stackoverflow.com_pt_all_2023-06.zim", + "apple.stackexchange.com_en_all_2023-05.zim", + "diy.stackexchange.com_en_all_2023-08.zim", + "es.stackoverflow.com_es_all_2023-06.zim", + "gis.stackexchange.com_en_all_2023-05.zim", + "stats.stackexchange.com_en_all_2023-05.zim", + "physics.stackexchange.com_en_all_2023-05.zim", + "serverfault.com_en_all_2023-05.zim", + "electronics.stackexchange.com_en_all_2023-05.zim", + "tex.stackexchange.com_en_all_2023-05.zim", + "wikibooks_de_all_nopic_2021-03.zim", + "askubuntu.com_en_all_2023-05.zim", + "superuser.com_en_all_2023-05.zim", + "lesfondamentaux.reseau-canope.fr_fr_all_2022-11.zim", + "wikibooks_en_all_nopic_2021-03.zim", + "courses.lumenlearning.com_en_all_2021-03.zim", + "wikipedia_de_all_nopic_2023-10.zim", + "wikipedia_en_all_nopic_2023-10.zim", + "stackoverflow.com_en_all_nopic_2022-07.zim", + "stackoverflow.com_en_all_2023-05.zim", + "armypubs_en_all_2023-08.zim", + "vikidia_en_all_nopic_2023-09.zim", + "wikiquote_de_all_nopic_2023-10.zim", + "wikiquote_en_all_nopic_2023-09.zim", + "wiktionary_de_all_nopic_2023-10.zim", + "wiktionary_en_all_nopic_2023-10.zim", + "wikihow_de_maxi_2023-10.zim", + "wikivoyage_de_all_nopic_2023-09.zim", + "wikiversity_de_all_nopic_2021-03.zim", + "wikiversity_en_all_nopic_2021-03.zim", + "wikisource_de_all_nopic_2023-09.zim", + "wikisource_en_all_nopic_2023-08.zim", + "ted_countdown_global_2023-09.zim", + "ted_en_design_2023-09.zim", + "ted_en_business_2023-09.zim", + "ted_en_global_issues_2023-09.zim", + + // 302 + "moderators.stackexchange.com_en_all_2023-05.zim", + "beer.stackexchange.com_en_all_2023-05.zim", + "health.stackexchange.com_en_all_2023-05.zim", + "avp.stackexchange.com_en_all_2023-05.zim", + "lowtechmagazine.com_en_all_2023-08.zim", + "ifixit_de_all_2023-07.zim", + "ifixit_en_all_2023-10.zim", + "der-postillon.com_de_all_2020-12.zim", + "wikihow_en_maxi_2023-03.zim", + }; + public static void main(String[] args) { + Set skip = new HashSet<>(); + for (String s: skip_files) skip.add(s); // zim file import test // will test mostly if domain names are included in zim file urls String zimFilesPath = args[0]; @@ -298,7 +470,10 @@ public static void main(String[] args) { } Collection orderedFiles = orderedFileMap.values(); + Set files_ok = new LinkedHashSet<>(); + Set files_nok = new LinkedHashSet<>(); for (File f: orderedFiles) { + if (skip.contains(f.getName())) continue; try { ZIMFile z = new ZIMFile(f.getAbsolutePath()); ZIMReader r = new ZIMReader(z); @@ -308,16 +483,20 @@ public static void main(String[] args) { System.out.println("Namespace: " + de.namespace); System.out.println("Title: " + de.title); System.out.println("URL: " + de.url); - System.out.println("guessed domain: " + guessDomainName(f.getName())); + System.out.println("guessed domain: " + guessDomainName(f.getName())); // uses a table and rules that deduces a source from the file name String source = getSource(r); - System.out.println("guessed Source: " + source); - String mainURL = guessURL(source, de); + System.out.println("guessed Source: " + source); // this uses metadata stored in the zim file + DigestURL mainURL = guessURL(source, de); System.out.println("guessed main article: " + mainURL); - System.out.println("main article exists: " + new DigestURL(mainURL).exists(ClientIdentification.browserAgent)); + boolean ok = mainURL.exists(ClientIdentification.browserAgent); + System.out.println("main article exists: " + ok); + if (ok) files_ok.add(f.getName()); else files_nok.add(f.getName()); System.out.println(); } catch (IOException e) { e.printStackTrace(); } } + System.out.println("ok files: " + files_ok.toString()); + System.out.println("not-ok files: " + files_nok.toString()); } } diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 0ad6b2248..f02577244 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -53,7 +53,6 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; import org.apache.pdfbox.text.PDFTextStripper; -import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; @@ -69,9 +68,6 @@ public class pdfParser extends AbstractParser implements Parser { - public static boolean individualPages = false; - public static String individualPagePropertyname = "page"; - public pdfParser() { super("Acrobat Portable Document Parser"); this.SUPPORTED_EXTENSIONS.add("pdf"); @@ -149,98 +145,36 @@ public Document[] parse( // get the links final List> pdflinks = extractPdfLinks(pdfDoc); - // get the fulltext (either per document or for each page) - final PDFTextStripper stripper = new PDFTextStripper(/*StandardCharsets.UTF_8.name()*/); - - if (individualPages) { - // this is a hack which stores individual pages of the source pdf into individual index documents - // the new documents will get a virtual link with a post argument page=X appended to the original url - - // collect text - final int pagecount = pdfDoc.getNumberOfPages(); - final String[] pages = new String[pagecount]; - for (int page = 1; page <= pagecount; page++) { - stripper.setStartPage(page); - stripper.setEndPage(page); - pages[page - 1] = stripper.getText(pdfDoc); - //System.out.println("PAGE " + page + ": " + pages[page - 1]); - } - - // create individual documents for each page - assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size(); - result = new Document[Math.min(pages.length, pdflinks.size())]; - final String loc = location.toNormalform(true); - for (int page = 0; page < result.length; page++) { - result[page] = new Document( - new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash - mimeType, - StandardCharsets.UTF_8.name(), - this, - null, - docKeywords, - singleList(docTitle), - docAuthor, - docPublisher, - null, - null, - 0.0d, 0.0d, - pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]), - pdflinks == null || page >= pdflinks.size() ? null : pdflinks.get(page), - null, - null, - false, - docDate); - } - } else { - // collect the whole text at once - final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); - byte[] contentBytes = new byte[0]; - stripper.setEndPage(3); // get first 3 pages (always) - writer.append(stripper.getText(pdfDoc)); - contentBytes = writer.getBytes(); // remember text in case of interrupting thread - - if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read - stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text) - stripper.setEndPage(Integer.MAX_VALUE); // set to default - // we start the pdf parsing in a separate thread to ensure that it can be terminated - final PDDocument pdfDocC = pdfDoc; - final Thread t = new Thread("pdfParser.getText:" + location) { - @Override - public void run() { - try { - writer.append(stripper.getText(pdfDocC)); - } catch (final Throwable e) {} - } - }; - t.start(); - t.join(3000); // pdfbox likes to forget to terminate ... (quite often) - if (t.isAlive()) t.interrupt(); - contentBytes = writer.getBytes(); // get final text before closing writer - writer.close(); // free writer resources - } - - final Collection pdflinksCombined = new HashSet<>(); - for (final Collection pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx); - result = new Document[]{new Document( - location, - mimeType, - StandardCharsets.UTF_8.name(), - this, - null, - docKeywords, - singleList(docTitle), - docAuthor, - docPublisher, - null, - null, - 0.0d, 0.0d, - contentBytes, - pdflinksCombined, - null, - null, - false, - docDate)}; - } + // collect the whole text at once + final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); + byte[] contentBytes = new byte[0]; + final PDFTextStripper stripper = new PDFTextStripper(); + stripper.setEndPage(Integer.MAX_VALUE); + writer.append(stripper.getText(pdfDoc)); + contentBytes = writer.getBytes(); // remember text in case of interrupting thread + writer.close(); // free writer resources + + final Collection pdflinksCombined = new HashSet<>(); + for (final Collection pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx); + result = new Document[]{new Document( + location, + mimeType, + StandardCharsets.UTF_8.name(), + this, + null, + docKeywords, + singleList(docTitle), + docAuthor, + docPublisher, + null, + null, + 0.0d, 0.0d, + contentBytes, + pdflinksCombined, + null, + null, + false, + docDate)}; } catch (final Throwable e) { //throw new Parser.Failure(e.getMessage(), location); } finally { diff --git a/source/net/yacy/htroot/ConfigParser_p.java b/source/net/yacy/htroot/ConfigParser_p.java index e466d783b..943279382 100644 --- a/source/net/yacy/htroot/ConfigParser_p.java +++ b/source/net/yacy/htroot/ConfigParser_p.java @@ -61,13 +61,6 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea env.setConfig(SwitchboardConstants.PARSER_MIME_DENY, TextParser.getDenyMime()); env.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, TextParser.getDenyExtension()); } - - if (post.containsKey("pdfSettings")) { - env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, post.getBoolean("individualPages")); - env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, post.get("individualPagePropertyname", "page")); - pdfParser.individualPages = sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false); - pdfParser.individualPagePropertyname = sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page"); - } } int i = 0; @@ -94,9 +87,6 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea prop.put("parser", i); - prop.put("individualPages", sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false)); - prop.put("individualPagePropertyname", sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page")); - // return rewrite properties return prop; } diff --git a/source/net/yacy/htroot/Crawler_p.java b/source/net/yacy/htroot/Crawler_p.java index e95562713..8c898f558 100644 --- a/source/net/yacy/htroot/Crawler_p.java +++ b/source/net/yacy/htroot/Crawler_p.java @@ -774,7 +774,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje } /* - * PPM + * PPM LF MH @@ -784,19 +784,19 @@ public static serverObjects respond(final RequestHeader header, final serverObje if (post != null && post.containsKey("crawlingPerformance")) { final String crawlingPerformance = post.get("crawlingPerformance", "custom"); final long LCbusySleep1 = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L); - int wantedPPM = (LCbusySleep1 == 0) ? 30000 : (int) (60000L / LCbusySleep1); + int wantedPPM = (LCbusySleep1 == 0) ? 60000 : (int) (60000L / LCbusySleep1); try { wantedPPM = post.getInt("customPPM", wantedPPM); } catch (final NumberFormatException e) {} if ("minimum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 10; - if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 30000; + if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 60000; int wPPM = wantedPPM; if ( wPPM <= 0 ) { wPPM = 1; } - if ( wPPM >= 30000 ) { - wPPM = 30000; + if ( wPPM >= 60000 ) { + wPPM = 60000; } final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60 diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 33b797524..2d93ec8b7 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -981,17 +981,7 @@ public URIMetadataNode makeResultEntry( public String urlstring() { if (this.alternative_urlstring != null) return this.alternative_urlstring; - if (!pdfParser.individualPages) return this.url().toNormalform(true); - if (!"pdf".equals(MultiProtocolURL.getFileExtension(this.url().getFileName()).toLowerCase(Locale.ROOT))) return this.url().toNormalform(true); - // for pdf links we rewrite the url - // this is a special treatment of pdf files which can be splitted into subpages - String pageprop = pdfParser.individualPagePropertyname; - String resultUrlstring = this.url().toNormalform(true); - int p = resultUrlstring.lastIndexOf(pageprop + "="); - if (p > 0) { - return resultUrlstring.substring(0, p - 1) + "#page=" + resultUrlstring.substring(p + pageprop.length() + 1); - } - return resultUrlstring; + return this.url().toNormalform(true); } /** * used for search result entry diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 39f856ea3..bebd16cbd 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -176,6 +176,7 @@ import net.yacy.document.importer.JsonListImporter; import net.yacy.document.importer.OAIListFriendsLoader; import net.yacy.document.importer.WarcImporter; +import net.yacy.document.importer.ZimImporter; import net.yacy.document.parser.audioTagParser; import net.yacy.document.parser.pdfParser; import net.yacy.document.parser.html.Evaluation; @@ -906,8 +907,6 @@ public void run() { TextParser.setDenyMime(this.getConfig(SwitchboardConstants.PARSER_MIME_DENY, "")); TextParser.setDenyExtension(this.getConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, "")); - pdfParser.individualPages = this.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false); - pdfParser.individualPagePropertyname = this.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page"); // start a loader this.log.config("Starting Crawl Loader"); @@ -2153,6 +2152,20 @@ public boolean processSurrogate(final String s) { this.log.warn("IO Error processing warc file " + infile); } return moved; + } else if (s.endsWith(".zim")) { + try { + final ZimImporter wri = new ZimImporter(infile.getAbsolutePath()); + wri.start(); + try { + wri.join(); + } catch (final InterruptedException ex) { + return moved; + } + moved = infile.renameTo(outfile); + } catch (final IOException ex) { + this.log.warn("IO Error processing zim file " + infile); + } + return moved; } else if (s.endsWith(".jsonlist") || s.endsWith(".jsonlist.gz") || s.endsWith(".flatjson")) { return this.processSurrogateJson(infile, outfile); } @@ -2349,6 +2362,7 @@ public boolean surrogateProcess() { if ( surrogate.endsWith(".xml") || surrogate.endsWith(".xml.gz") || surrogate.endsWith(".xml.zip") + || surrogate.endsWith(".zim") || surrogate.endsWith(".warc") || surrogate.endsWith(".warc.gz") || surrogate.endsWith(".jsonlist") diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index b871291cf..fedef45e7 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -220,8 +220,6 @@ public final class SwitchboardConstants { public static final String INDEX_TRANSFER_GZIP_BODY = "indexTransfer.gzipBody"; public static final String PARSER_MIME_DENY = "parser.mime.deny"; public static final String PARSER_EXTENSIONS_DENY = "parser.extensions.deny"; - public static final String PARSER_PDF_INDIVIDUALPAGES = "parser.pdf.individualpages"; - public static final String PARSER_PDF_INDIVIDUALPAGES_KEY = "parser.pdf.individualpages.key"; /** *

public static final String PROXY_ONLINE_CAUTION_DELAY = "onlineCautionDelay"

*

Name of the setting how long indexing should pause after the last time the proxy was used in milliseconds

diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java index 906bf30a9..a241507ab 100644 --- a/source/org/openzim/ZIMFile.java +++ b/source/org/openzim/ZIMFile.java @@ -113,20 +113,24 @@ public ZIMFile(final String path) throws IOException { } this.mimeTypeList = mList.toArray(new String[mList.size()]); - // Initialize the Url Pointer List - this.urlPtrListBlob = new byte[this.header_entryCount * 8]; - mReader.seek(this.header_urlPtrPos); - RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob); - - // Initialize the Title Pointer List - this.titlePtrListBlob = new byte[this.header_entryCount * 4]; - mReader.seek(this.header_titlePtrPos); - RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob); - - // Initialize the Cluster Pointer List - this.clusterPtrListBlob = new byte[this.header_clusterCount * 8]; - mReader.seek(this.header_clusterPtrPos); - RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob); + try { + // Initialize the Url Pointer List + this.urlPtrListBlob = new byte[this.header_entryCount * 8]; + mReader.seek(this.header_urlPtrPos); + RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob); + + // Initialize the Title Pointer List + this.titlePtrListBlob = new byte[this.header_entryCount * 4]; + mReader.seek(this.header_titlePtrPos); + RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob); + + // Initialize the Cluster Pointer List + this.clusterPtrListBlob = new byte[this.header_clusterCount * 8]; + mReader.seek(this.header_clusterPtrPos); + RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob); + } catch (IndexOutOfBoundsException e) { + throw new IOException(e.getMessage()); + } } public final String getMimeType(int idx) {