diff --git a/app/lib/extract.ts b/app/lib/extract.ts
new file mode 100644
index 0000000..de7cbe9
--- /dev/null
+++ b/app/lib/extract.ts
@@ -0,0 +1,62 @@
+import { Collection } from "@arsonar/client/index.js";
+import { PDFExtract, PDFExtractOptions } from 'pdf.js-extract'
+import { Readable } from 'node:stream'
+import { file as createTempFile } from 'tmp-promise'
+import fs from 'fs/promises'
+
+export type ExtractResult = {
+  fullText?: string
+}
+
+/**
+ * Extracts the text of the PDF file at `path`.
+ *
+ * The returned string is the `dc:title` metadata value (when it is a string)
+ * followed by the `str` of every content item of every page, each prefixed
+ * with a single space.
+ */
+async function readPdfText(path: string): Promise<string> {
+  const pdfExtract = new PDFExtract()
+  const data = await pdfExtract.extract(path)
+  // @ts-ignore -- NOTE(review): presumably 'dc:title' is missing from the library typings; confirm
+  const title: unknown = data.meta?.metadata?.['dc:title']
+  let text = typeof title === 'string' ? title : ''
+  for (const page of data.pages) {
+    for (const item of page.content) {
+      text += ' ' + item.str
+    }
+  }
+  return text
+}
+
+/**
+ * Reads file `fileId` from `collection` and extracts its text as a PDF.
+ *
+ * The file content is copied to a temporary file, because the extractor is
+ * given a filesystem path. The temporary file is cleaned up before returning
+ * or throwing.
+ *
+ * @param collection - Collection whose `files.readFile` provides the content.
+ * @param fileId - Identifier passed to `collection.files.readFile`.
+ * @returns An object with `fullText` set when extraction succeeded; an empty
+ *   object when extraction failed (the error is logged, not rethrown).
+ * @throws Whatever reading the file or writing the temporary file throws.
+ */
+export async function extractTextFromFile(collection: Collection, fileId: string): Promise<ExtractResult> {
+  const fileStream = await collection.files.readFile(fileId)
+  const { path, cleanup } = await createTempFile()
+  const result: ExtractResult = {}
+  try {
+    await fs.writeFile(path, Readable.from(fileStream))
+    try {
+      result.fullText = await readPdfText(path)
+    } catch (err) {
+      // Best effort: a failed extraction is logged and yields an empty result.
+      console.error('Text extraction failed', err)
+    }
+  } finally {
+    // Also runs when writing the temporary file throws, so it is never leaked.
+    await cleanup()
+  }
+  return result
+}
diff --git a/app/schema.tsx b/app/schema.tsx index 1cbac7e..d039b57 100644 --- a/app/schema.tsx +++ b/app/schema.tsx @@ -71,6 +71,9 @@ export const schema: Spec = { file: { type: 'string', }, + fullText: { + type: 'string' + } }, }, }, diff --git a/app/sonar.server.ts b/app/sonar.server.ts index dc93d75..5573415 100644 --- a/app/sonar.server.ts +++ b/app/sonar.server.ts @@ -2,6 +2,7 @@ import { Workspace } from '@arsonar/client' import type { Collection } from '@arsonar/client' import Dotenv from 'dotenv' import { schema } from './schema' +import { extractTextFromFile } from './lib/extract.js' Dotenv.config() @@ -43,8 +44,11 @@ export async function openCollection(): Promise { return collection } -export async function createBookRecord(data: typeof schema.types.Book.fields) { 
+export async function createBookRecord(data: any) { const collection = await openCollection() + const fileId = data.file + const fullText = await extractTextFromFile(collection, fileId) + if (fullText.fullText) data.fullText = fullText.fullText const record = await collection.put({ type: 'Book', value: data, diff --git a/package.json b/package.json index 8d558cb..34806cb 100644 --- a/package.json +++ b/package.json @@ -22,12 +22,14 @@ "autoprefixer": "^10.4.7", "dotenv": "^16.0.1", "npm-run-all": "^4.1.5", + "pdf.js-extract": "^0.2.0", "postcss": "^8.4.14", "react": "^17.0.2", "react-dom": "^17.0.2", "react-icons": "^4.4.0", "simple-isbn": "^1.1.5", - "tailwindcss": "^3.1.4" + "tailwindcss": "^3.1.4", + "tmp-promise": "^3.0.3" }, "devDependencies": { "@remix-run/dev": "^1.5.1", diff --git a/yarn.lock b/yarn.lock index fd55465..098dd40 100644 --- a/yarn.lock +++ b/yarn.lock @@ -3458,6 +3458,11 @@ dom-accessibility-api@^0.5.9: resolved "https://registry.yarnpkg.com/dom-accessibility-api/-/dom-accessibility-api-0.5.14.tgz#56082f71b1dc7aac69d83c4285eef39c15d93f56" integrity sha512-NMt+m9zFMPZe0JcY9gN224Qvk6qLIdqex29clBvc/y75ZBX9YA9wNK3frsYvu2DI1xcCIwxwnX+TlsJ2DSOADg== +dommatrix@0.0.24: + version "0.0.24" + resolved "https://registry.yarnpkg.com/dommatrix/-/dommatrix-0.0.24.tgz#0b793da372992878b74c31f64ac85e7a13cb382a" + integrity sha512-PatEhAW5pIHr28MvFQGV5iiHNloqvecQZlxs7/8s/eulLqZI3uVqPkrO7YDuqsebovr/9mmcWDSWzVG4amEZgQ== + dotenv@^16.0.0, dotenv@^16.0.1: version "16.0.1" resolved "https://registry.yarnpkg.com/dotenv/-/dotenv-16.0.1.tgz#8f8f9d94876c35dac989876a5d3a82a267fdce1d" @@ -7711,6 +7716,14 @@ path-type@^4.0.0: resolved "https://registry.yarnpkg.com/path-type/-/path-type-4.0.0.tgz#84ed01c0a7ba380afe09d90a8c180dcd9d03043b" integrity sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw== +pdf.js-extract@^0.2.0: + version "0.2.0" + resolved 
"https://registry.yarnpkg.com/pdf.js-extract/-/pdf.js-extract-0.2.0.tgz#162f1aefae6b1fe2aa683057fcd9fd80d44ff58c" + integrity sha512-oUcah0BEsiAtO25SrhG3jzihmiYw9bq+oH0T5NcomKu8Y2pU5j1xco2JPWV2i2vl38nf8cL/MiBu+Fcog6I07A== + dependencies: + dommatrix "0.0.24" + web-streams-polyfill "3.2.0" + peek-stream@^1.1.0: version "1.1.3" resolved "https://registry.yarnpkg.com/peek-stream/-/peek-stream-1.1.3.tgz#3b35d84b7ccbbd262fff31dc10da56856ead6d67" @@ -9755,7 +9768,7 @@ timeout-refresh@^1.0.0, timeout-refresh@^1.0.2, timeout-refresh@^1.0.3: resolved "https://registry.yarnpkg.com/timeout-refresh/-/timeout-refresh-1.0.3.tgz#7024a8ce0a09a57acc2ea86002048e6c0bff7375" integrity sha512-Mz0CX4vBGM5lj8ttbIFt7o4ZMxk/9rgudJRh76EvB7xXZMur7T/cjRiH2w4Fmkq0zxf2QpM8IFvOSRn8FEu3gA== -tmp-promise@^3.0.2: +tmp-promise@^3.0.2, tmp-promise@^3.0.3: version "3.0.3" resolved "https://registry.yarnpkg.com/tmp-promise/-/tmp-promise-3.0.3.tgz#60a1a1cc98c988674fcbfd23b6e3367bdeac4ce7" integrity sha512-RwM7MoPojPxsOBYnyd2hy0bxtIlVrihNs9pj5SUvY8Zz1sQcQG2tG1hSr8PDxfgEB8RNKDhqbIlroIarSNDNsQ== @@ -10307,6 +10320,11 @@ web-encoding@1.1.5: optionalDependencies: "@zxing/text-encoding" "0.9.0" +web-streams-polyfill@3.2.0: + version "3.2.0" + resolved "https://registry.yarnpkg.com/web-streams-polyfill/-/web-streams-polyfill-3.2.0.tgz#a6b74026b38e4885869fb5c589e90b95ccfc7965" + integrity sha512-EqPmREeOzttaLRm5HS7io98goBgZ7IVz79aDvqjD0kYXLtFZTc0T/U6wHTPKyIjb+MdN7DFIIX6hgdBEpWmfPA== + web-streams-polyfill@^3.1.1: version "3.2.1" resolved "https://registry.yarnpkg.com/web-streams-polyfill/-/web-streams-polyfill-3.2.1.tgz#71c2718c52b45fd49dbeee88634b3a60ceab42a6"