Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,6 @@ Supported files:

The plugin caches the extracted texts as small local `.json` files inside the plugin directory. Those files can be synced between your devices. Since text extraction does not work on mobile, the plugin will use the synced cached texts if available. If not, an empty string will be returned.




## Installation

Text Extractor is available on the [Obsidian community plugins repository](https://obsidian.md/plugins?search=Text%20Extractor). You can also install it manually by downloading the latest release from the [releases page](https://github.com/scambier/obsidian-text-extractor/releases) or by using the [BRAT plugin manager](https://github.com/TfTHacker/obsidian42-brat).
Expand Down Expand Up @@ -83,9 +80,26 @@ Note that Text Extractor only extract texts _on demand_, when you call `extractT

While this plugin was first developed for Omnisearch, it's totally agnostic and I'd like it to become a community effort. If you wish to submit a PR, please open an issue first so we can discuss the feature.

Learn the basics of Obsidian plugin development at https://docs.obsidian.md/Plugins/Getting+started/Build+a+plugin

The plugin is split into two parts:

- The text extraction library, which does the actual work
- The plugin itself, which is a wrapper around the library and exposes some useful options to the user

Each project is in its own folder, and has its own `package.json` and `node_modules`. The library uses Rollup (easier to set up with Wasm and web workers), while the plugin uses esbuild.

```
# Replace the placeholder with the root folder of your vault.
obsidian_vault_path=<path_to_your_obsidian_vault>
plugin_path="${obsidian_vault_path}/.obsidian/plugins/text-extractor"
# Build the text extraction library.
cd lib
pnpm install
pnpm run build
# Build the plugin.
cd ../plugin
pnpm install
pnpm run build
cd ..
# Copy the manifest and the build output into the vault.
# The expansions are quoted so that vault paths containing spaces work.
mkdir -p "$plugin_path"
cp manifest.json "$plugin_path"
cp plugin/dist/* "$plugin_path"
```
6 changes: 3 additions & 3 deletions lib/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
"main": "dist/index.js",
"types": "dist/src/index.d.ts",
"scripts": {
"dev": "wasm-pack build --target web && rollup -c -w",
"build": "wasm-pack build --target web && rollup -c",
"dev": "rollup -c -w",
"build": "rollup -c",
"test": "vitest",
"coverage": "vitest run --coverage",
"prepublishOnly": "pnpm run build"
Expand Down Expand Up @@ -54,4 +54,4 @@
"tesseract.js": "^3.0.3",
"xlsx": "https://cdn.sheetjs.com/xlsx-0.20.1/xlsx-0.20.1.tgz"
}
}
}
2 changes: 2 additions & 0 deletions lib/src/globals.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ let PdfBackgroundProcess = Math.max(
if (PdfBackgroundProcess == cpuCount) {
PdfBackgroundProcess = 1
}
// PDF.js crashes with an out-of-memory error if too many workers are used
//PdfBackgroundProcess = 1

console.info(
`Text Extractor - Number of available workers: ${PdfBackgroundProcess} for PDFs, ${ocrBackgroundProcesses} for OCR, ${officeBackgroundProcesses} for Office`
Expand Down
98 changes: 34 additions & 64 deletions lib/src/pdf/pdf-manager.ts
Original file line number Diff line number Diff line change
@@ -1,55 +1,12 @@
import { Platform, TFile } from 'obsidian'
import { Platform, TFile, loadPdfJs } from 'obsidian'
import WebWorker from 'web-worker:./pdf-worker.ts'
import {
CANT_EXTRACT_ON_MOBILE,
FAILED_TO_EXTRACT,
pdfProcessQueue,
workerTimeout,
} from '../globals'
import { getCachePath, readCache, writeCache } from '../cache'

/**
 * A pooled web worker used to extract text from PDF files.
 *
 * Instances are obtained through {@link PDFWorker.getWorker}, which reuses an
 * idle worker from the pool when one exists and spawns a new one otherwise.
 * A worker that times out or raises an error is terminated and removed from
 * the pool.
 */
class PDFWorker {
  /** Every worker that has been spawned and not yet destroyed. */
  static #pool: PDFWorker[] = []
  /** True while this worker is processing a message. */
  #running = false

  private constructor(private worker: Worker) {}

  /**
   * Returns an idle worker from the pool, spawning a new one if all the
   * existing workers are busy.
   */
  static getWorker(): PDFWorker {
    const free = PDFWorker.#pool.find(w => !w.#running)
    if (free) {
      return free
    }
    // Spawn a new worker
    const worker = new PDFWorker(new WebWorker({ name: 'PDF Text Extractor' }))
    PDFWorker.#pool.push(worker)
    return worker
  }

  /** Terminates the underlying worker and removes it from the pool. */
  static #destroyWorker(pdfWorker: PDFWorker) {
    pdfWorker.worker.terminate()
    PDFWorker.#pool = PDFWorker.#pool.filter(w => w !== pdfWorker)
  }

  /**
   * Posts `msg` to the worker and waits for its first reply.
   *
   * @param msg - The message posted to the worker: the PDF bytes and a name
   *   used in log output.
   * @returns A promise resolved with the message event sent back by the worker.
   *   It is rejected with the string `'timeout'` when no reply arrives within
   *   `workerTimeout` milliseconds, or with the error event when the worker
   *   reports an error. In both failure cases the worker is destroyed.
   */
  public async run(msg: { data: Uint8Array; name: string }): Promise<any> {
    return new Promise((resolve, reject) => {
      this.#running = true

      const timeout = setTimeout(() => {
        console.warn('Text Extractor - PDF Worker timeout for ', msg.name)
        reject('timeout')
        PDFWorker.#destroyWorker(this)
      }, workerTimeout)

      // Install the handlers before posting, so that no reply can be missed.
      this.worker.onmessage = evt => {
        clearTimeout(timeout)
        this.#running = false
        resolve(evt)
      }
      // Without this handler, a worker that throws would keep the caller
      // waiting for the whole timeout. Fail fast and retire the worker instead.
      this.worker.onerror = evt => {
        clearTimeout(timeout)
        PDFWorker.#destroyWorker(this)
        reject(evt)
      }

      this.worker.postMessage(msg)
    })
  }
}

class PDFManager {
public async getPdfText(file: TFile): Promise<string> {
try {
Expand Down Expand Up @@ -77,28 +34,41 @@ class PDFManager {
// The PDF is not cached, extract it
const cachePath = getCachePath(file)
const data = new Uint8Array(await app.vault.readBinary(file))
const worker = PDFWorker.getWorker()

return new Promise(async (resolve, reject) => {
try {
const res = await worker.run({ data, name: file.basename })
const text = (res.data.text as string)
// Replace \n with spaces
.replace(/\n/g, ' ')
// Trim multiple spaces
.replace(/ +/g, ' ')
.trim()
// Load the PDF.js library
const pdfjs = await loadPdfJs()
const loadingTask = pdfjs.getDocument({ data })
const pdf = await loadingTask.promise
const pagePromises = [];
// Get text from each page of the PDF
for (let j = 1; j <= pdf.numPages; j++) {
const page = pdf.getPage(j);

// @ts-ignore
pagePromises.push(page.then((page) => {
// @ts-ignore
const textContentPromise: Promise<{ items }> = page.getTextContent();
return textContentPromise.then((t) => {
const items = t.items;
let text = '';
for (let i = 0; i < items.length; i++) {
text += items[i].str + ' ';
}
return text;
});
}));
}

const texts = await Promise.all(pagePromises);

// Add it to the cache
const text = texts.join(' ');
await writeCache(cachePath.folder, cachePath.filename, text, file.path, '')
console.info(`Text Extractor - Extracted text from ${file.basename}. ${pdfProcessQueue.size} PDF extract jobs still queued.`)
// Add a pause to allow garbage collection to run more often
//await new Promise((resolve) => setTimeout(resolve, 5000))

// Add it to the cache
await writeCache(cachePath.folder, cachePath.filename, text, file.path, '')
resolve(text)
} catch (e) {
// In case of error (unreadable PDF or timeout) just add
// an empty string to the cache
await writeCache(cachePath.folder, cachePath.filename, '', file.path, '')
resolve('')
}
})
return text
}
}

Expand Down
21 changes: 0 additions & 21 deletions lib/src/pdf/pdf-worker.ts

This file was deleted.