diff --git a/README.md b/README.md index 73b220a..77bd64b 100644 --- a/README.md +++ b/README.md @@ -39,9 +39,6 @@ Supported files: The plugin caches the extracted texts as local small `.json` files inside the plugin directory. Those files can be synced between your devices. Since text extraction does not work on mobile, the plugin will use the synced cached texts if available. If not, an empty string will be returned. - - - ## Installation Text Extractor is available on the [Obsidian community plugins repository](https://obsidian.md/plugins?search=Text%20Extractor). You can also install it manually by downloading the latest release from the [releases page](https://github.com/scambier/obsidian-text-extractor/releases) or by using the [BRAT plugin manager](https://github.com/TfTHacker/obsidian42-brat). @@ -83,9 +80,26 @@ Note that Text Extractor only extract texts _on demand_, when you call `extractT While this plugin is first developed for Omnisearch, it's totally agnostic and I'd like it to become a community effort. If you wish to submit a PR, please open an issue first so we can discuss the feature. +Learn the basics of Obsidian plugin development at https://docs.obsidian.md/Plugins/Getting+started/Build+a+plugin + The plugin is split in two parts: - The text extraction library, which does the actual work - The plugin itself, which is a wrapper around the library and exposes some useful options to the user Each project is in its own folder, and has its own `package.json` and `node_modules`. The library uses Rollup (easier to setup with Wasm and web workers), while the plugin uses esbuild. + +``` +obsidian_vault_path= +plugin_path="${obsidian_vault_path}/.obsidian/plugins/text-extractor" +cd lib +pnpm install +pnpm run build +cd ../plugin +pnpm install +pnpm run build +cd .. +mkdir -p $plugin_path +cp manifest.json $plugin_path +cp plugin/dist/* $plugin_path +``` \ No newline at end of file diff --git a/lib/package.json b/lib/package.json index 311aa3c..1036b4d 100644 --- a/lib/package.json +++ b/lib/package.json @@ -10,8 +10,8 @@ "main": "dist/index.js", "types": "dist/src/index.d.ts", "scripts": { - "dev": "wasm-pack build --target web && rollup -c -w", - "build": "wasm-pack build --target web && rollup -c", + "dev": "rollup -c -w", + "build": "rollup -c", "test": "vitest", "coverage": "vitest run --coverage", "prepublishOnly": "pnpm run build" @@ -54,4 +54,4 @@ "tesseract.js": "^3.0.3", "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.1/xlsx-0.20.1.tgz" } -} +} \ No newline at end of file diff --git a/lib/src/globals.ts b/lib/src/globals.ts index 64e1cd8..5e2b231 100644 --- a/lib/src/globals.ts +++ b/lib/src/globals.ts @@ -17,6 +17,8 @@ let PdfBackgroundProcess = Math.max( if (PdfBackgroundProcess == cpuCount) { PdfBackgroundProcess = 1 } +// PDF.js crashes with an out-of-memory error if too many workers are used +//PdfBackgroundProcess = 1 console.info( `Text Extractor - Number of available workers: ${PdfBackgroundProcess} for PDFs, ${ocrBackgroundProcesses} for OCR, ${officeBackgroundProcesses} for Office` diff --git a/lib/src/pdf/pdf-manager.ts b/lib/src/pdf/pdf-manager.ts index 9f58b34..8cdcb5f 100644 --- a/lib/src/pdf/pdf-manager.ts +++ b/lib/src/pdf/pdf-manager.ts @@ -1,55 +1,12 @@ -import { Platform, TFile } from 'obsidian' +import { Platform, TFile, loadPdfJs } from 'obsidian' import WebWorker from 'web-worker:./pdf-worker.ts' import { CANT_EXTRACT_ON_MOBILE, FAILED_TO_EXTRACT, pdfProcessQueue, - workerTimeout, } from '../globals' import { getCachePath, readCache, writeCache } from '../cache' -class PDFWorker { - static #pool: PDFWorker[] = [] - #running = false - - private constructor(private worker: Worker) {} - - static getWorker(): PDFWorker { - const free = PDFWorker.#pool.find(w => !w.#running) - if (free) { - return free - } - // Spawn a new worker - const worker = new PDFWorker(new WebWorker({ name: 'PDF Text Extractor' })) - PDFWorker.#pool.push(worker) - return worker - } - - static #destroyWorker(pdfWorker: PDFWorker) { - pdfWorker.worker.terminate() - PDFWorker.#pool = PDFWorker.#pool.filter(w => w !== pdfWorker) - } - - public async run(msg: { data: Uint8Array; name: string }): Promise { - return new Promise((resolve, reject) => { - this.#running = true - - const timeout = setTimeout(() => { - console.warn('Text Extractor - PDF Worker timeout for ', msg.name) - reject('timeout') - PDFWorker.#destroyWorker(this) - }, workerTimeout) - - this.worker.postMessage(msg) - this.worker.onmessage = evt => { - clearTimeout(timeout) - resolve(evt) - this.#running = false - } - }) - } -} - class PDFManager { public async getPdfText(file: TFile): Promise { try { @@ -77,28 +34,41 @@ class PDFManager { // The PDF is not cached, extract it const cachePath = getCachePath(file) const data = new Uint8Array(await app.vault.readBinary(file)) - const worker = PDFWorker.getWorker() - return new Promise(async (resolve, reject) => { - try { - const res = await worker.run({ data, name: file.basename }) - const text = (res.data.text as string) - // Replace \n with spaces - .replace(/\n/g, ' ') - // Trim multiple spaces - .replace(/ +/g, ' ') - .trim() + // Load the PDF.js library + const pdfjs = await loadPdfJs() + const loadingTask = pdfjs.getDocument({ data }) + const pdf = await loadingTask.promise + const pagePromises = []; + // Get text from each page of the PDF + for (let j = 1; j <= pdf.numPages; j++) { + const page = pdf.getPage(j); + + // @ts-ignore + pagePromises.push(page.then((page) => { + // @ts-ignore + const textContentPromise: Promise<{ items }> = page.getTextContent(); + return textContentPromise.then((t) => { + const items = t.items; + let text = ''; + for (let i = 0; i < items.length; i++) { + text += items[i].str + ' '; + } + return text; + }); + })); + } + + const texts = await Promise.all(pagePromises); + + // Add it to the cache + const text = texts.join(' '); + await writeCache(cachePath.folder, cachePath.filename, text, file.path, '') + console.info(`Text Extractor - Extracted text from ${file.basename}. ${pdfProcessQueue.size} PDF extract jobs still queued.`) + // Add a pause to allow garbage collection to run more often + //await new Promise((resolve) => setTimeout(resolve, 5000)) - // Add it to the cache - await writeCache(cachePath.folder, cachePath.filename, text, file.path, '') - resolve(text) - } catch (e) { - // In case of error (unreadable PDF or timeout) just add - // an empty string to the cache - await writeCache(cachePath.folder, cachePath.filename, '', file.path, '') - resolve('') - } - }) + return text } } diff --git a/lib/src/pdf/pdf-worker.ts b/lib/src/pdf/pdf-worker.ts deleted file mode 100644 index eea8dec..0000000 --- a/lib/src/pdf/pdf-worker.ts +++ /dev/null @@ -1,21 +0,0 @@ -import rustPlugin from '../../pkg/obsidian_text_extract_bg.wasm' -import * as plugin from '../../pkg' - -const decodedPlugin = decodeBase64(rustPlugin as any) - -onmessage = async evt => { - const buffer = Uint8Array.from(decodedPlugin, c => c.charCodeAt(0)) - await plugin.default(Promise.resolve(buffer)) - try { - const text = plugin.extract_pdf_text(evt.data.data as Uint8Array) - self.postMessage({ text }) - } catch (e) { - console.info('Text Extractor - Could not extract text from ' + evt.data.name) - self.postMessage({ text: '' }) - } -} - -function decodeBase64(data: string) { - return atob(data) - // return Buffer.from(data, 'base64').toString() -}