scambier · figadore · May 4, 2024 · May 4, 2024 · May 5, 2024 · May 5, 2024
diff --git a/README.md b/README.md
@@ -39,9 +39,6 @@ Supported files:
 
 The plugin caches the extracted texts as local small `.json` files inside the plugin directory. Those files can be synced between your devices. Since text extraction does not work on mobile, the plugin will use the synced cached texts if available. If not, an empty string will be returned.
 
-
-
-
 ## Installation
 
 Text Extractor is available on the [Obsidian community plugins repository](https://obsidian.md/plugins?search=Text%20Extractor). You can also install it manually by downloading the latest release from the [releases page](https://github.com/scambier/obsidian-text-extractor/releases) or by using the [BRAT plugin manager](https://github.com/TfTHacker/obsidian42-brat).
@@ -83,9 +80,26 @@ Note that Text Extractor only extract texts _on demand_, when you call `extractT
 
 While this plugin is first developed for Omnisearch, it's totally agnostic and I'd like it to become a community effort. If you wish to submit a PR, please open an issue first so we can discuss the feature.
 
+To learn the basics of Obsidian plugin development, see the official [Build a plugin](https://docs.obsidian.md/Plugins/Getting+started/Build+a+plugin) guide.
+
 The plugin is split in two parts:
 
 - The text extraction library, which does the actual work
 - The plugin itself, which is a wrapper around the library and exposes some useful options to the user
 
 Each project is in its own folder, and has its own `package.json` and `node_modules`. The library uses Rollup (easier to setup with Wasm and web workers), while the plugin uses esbuild.
+
+```sh
+obsidian_vault_path=<path_to_your_obsidian_vault>
+plugin_path="${obsidian_vault_path}/.obsidian/plugins/text-extractor"
+cd lib
+pnpm install
+pnpm run build
+cd ../plugin
+pnpm install
+pnpm run build
+cd ..
+mkdir -p "$plugin_path"
+cp manifest.json "$plugin_path"
+cp plugin/dist/* "$plugin_path"
+```
diff --git a/lib/package.json b/lib/package.json
@@ -10,8 +10,8 @@
   "main": "dist/index.js",
   "types": "dist/src/index.d.ts",
   "scripts": {
-    "dev": "wasm-pack build --target web && rollup -c -w",
-    "build": "wasm-pack build --target web && rollup -c",
+    "dev": "rollup -c -w",
+    "build": "rollup -c",
     "test": "vitest",
     "coverage": "vitest run --coverage",
     "prepublishOnly": "pnpm run build"
@@ -54,4 +54,4 @@
     "tesseract.js": "^3.0.3",
     "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.1/xlsx-0.20.1.tgz"
   }
-}
+}
diff --git a/lib/src/globals.ts b/lib/src/globals.ts
@@ -17,6 +17,8 @@ let PdfBackgroundProcess = Math.max(
 if (PdfBackgroundProcess == cpuCount) {
   PdfBackgroundProcess = 1
 }
+// PDF.js can crash with an out-of-memory error if too many workers are used.
+// To force a single PDF worker, uncomment: PdfBackgroundProcess = 1
 
 console.info(
   `Text Extractor - Number of available workers: ${PdfBackgroundProcess} for PDFs, ${ocrBackgroundProcesses} for OCR, ${officeBackgroundProcesses} for Office`

diff --git a/lib/src/pdf/pdf-manager.ts b/lib/src/pdf/pdf-manager.ts
@@ -1,55 +1,12 @@
-import { Platform, TFile } from 'obsidian'
+import { Platform, TFile, loadPdfJs } from 'obsidian'
 import WebWorker from 'web-worker:./pdf-worker.ts'
 import {
   CANT_EXTRACT_ON_MOBILE,
   FAILED_TO_EXTRACT,
   pdfProcessQueue,
-  workerTimeout,
 } from '../globals'
 import { getCachePath, readCache, writeCache } from '../cache'
 
-class PDFWorker {
-  static #pool: PDFWorker[] = []
-  #running = false
-
-  private constructor(private worker: Worker) {}
-
-  static getWorker(): PDFWorker {
-    const free = PDFWorker.#pool.find(w => !w.#running)
-    if (free) {
-      return free
-    }
-    // Spawn a new worker
-    const worker = new PDFWorker(new WebWorker({ name: 'PDF Text Extractor' }))
-    PDFWorker.#pool.push(worker)
-    return worker
-  }
-
-  static #destroyWorker(pdfWorker: PDFWorker) {
-    pdfWorker.worker.terminate()
-    PDFWorker.#pool = PDFWorker.#pool.filter(w => w !== pdfWorker)
-  }
-
-  public async run(msg: { data: Uint8Array; name: string }): Promise<any> {
-    return new Promise((resolve, reject) => {
-      this.#running = true
-
-      const timeout = setTimeout(() => {
-        console.warn('Text Extractor - PDF Worker timeout for ', msg.name)
-        reject('timeout')
-        PDFWorker.#destroyWorker(this)
-      }, workerTimeout)
-
-      this.worker.postMessage(msg)
-      this.worker.onmessage = evt => {
-        clearTimeout(timeout)
-        resolve(evt)
-        this.#running = false
-      }
-    })
-  }
-}
-
 class PDFManager {
   public async getPdfText(file: TFile): Promise<string> {
     try {
@@ -77,28 +34,41 @@ class PDFManager {
     // The PDF is not cached, extract it
     const cachePath = getCachePath(file)
     const data = new Uint8Array(await app.vault.readBinary(file))
-    const worker = PDFWorker.getWorker()
 
-    return new Promise(async (resolve, reject) => {
-      try {
-        const res = await worker.run({ data, name: file.basename })
-        const text = (res.data.text as string)
-          // Replace \n with spaces
-          .replace(/\n/g, ' ')
-          // Trim multiple spaces
-          .replace(/ +/g, ' ')
-          .trim()
+    // Load the PDF.js library
+    const pdfjs = await loadPdfJs()
+    const loadingTask = pdfjs.getDocument({ data })
+    const pdf = await loadingTask.promise
+    const pagePromises = [];
+    // Get text from each page of the PDF
+    for (let j = 1; j <= pdf.numPages; j++) {
+      const page = pdf.getPage(j);
+
+      // @ts-ignore
+      pagePromises.push(page.then((page) => {
+        // @ts-ignore
+        const textContentPromise: Promise<{ items }> = page.getTextContent();
+        return textContentPromise.then((t) => {
+          const items = t.items;
+          let text = '';
+          for (let i = 0; i < items.length; i++) {
+            text += items[i].str + ' ';
+          }
+          return text;
+        });
+      }));
+    }
+
+    const texts = await Promise.all(pagePromises);
+
+    // Add it to the cache
+    const text = texts.join(' ');
+    await writeCache(cachePath.folder, cachePath.filename, text, file.path, '')
+    console.info(`Text Extractor - Extracted text from ${file.basename}. ${pdfProcessQueue.size} PDF extract jobs still queued.`)
+    // Add a pause to allow garbage collection to run more often
+    //await new Promise((resolve) => setTimeout(resolve, 5000))
 
-        // Add it to the cache
-        await writeCache(cachePath.folder, cachePath.filename, text, file.path, '')
-        resolve(text)
-      } catch (e) {
-        // In case of error (unreadable PDF or timeout) just add
-        // an empty string to the cache
-        await writeCache(cachePath.folder, cachePath.filename, '', file.path, '')
-        resolve('')
-      }
-    })
+    return text
   }
 }
 

diff --git a/lib/src/pdf/pdf-worker.ts b/lib/src/pdf/pdf-worker.ts