-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdocument.rs
604 lines (547 loc) · 19.2 KB
/
document.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
use crate::{
auth::UserInfo, comment::Comment, date::DateInput, slugify, AnnotatedForm, AudioSlice,
Contributor, Database, Date, SourceAttribution, Translation, TranslationBlock,
};
use async_graphql::{dataloader::DataLoader, FieldResult, MaybeUndefined};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
/// A document with associated metadata and content broken down into pages and further into
/// paragraphs with an English translation. Also supports each word being broken down into
/// component parts and having associated notes.
// The GraphQL view of this type is defined by the `#[async_graphql::Object]` impl in this
// file; the derives here only cover serde (de)serialization and cloning.
#[derive(Serialize, Deserialize, Clone)]
pub struct AnnotatedDoc {
/// All non-content metadata about this document
// `#[serde(flatten)]` inlines the metadata fields into this struct's serialized form
// instead of nesting them under a `meta` key.
#[serde(flatten)]
pub meta: DocumentMetadata,
/// The meat of the document, all the pages which contain its contents.
// `AnnotatedDoc::new` always fills this with `Some(..)`.
pub segments: Option<Vec<TranslatedPage>>,
}
impl AnnotatedDoc {
/// Build a document from its metadata and raw contents.
pub fn new(meta: DocumentMetadata, segments: Vec<Vec<Vec<AnnotatedSeg>>>) -> Self {
// Skip the first block of the translation, since this usually contains
// the header and information for translators and editors.
let blocks = &meta
.translation
.as_ref()
.unwrap_or_else(|| panic!("Missing translation for {}", meta.short_name))
.paragraphs;
let mut pages = Vec::new();
let mut paragraph_index = 0;
for page in segments {
let mut paragraphs = Vec::new();
for paragraph in page {
if paragraph_index > 0 {
let trans = blocks.get(paragraph_index);
paragraphs.push(TranslatedSection {
translation: trans.map(TranslationBlock::get_text),
source: paragraph,
});
}
paragraph_index += 1;
}
pages.push(TranslatedPage { paragraphs });
}
Self {
segments: Some(pages),
meta,
}
}
}
#[async_graphql::Object]
impl AnnotatedDoc {
/// Official short identifier for this document
async fn id(&self) -> DocumentId {
self.meta.id
}
/// Full title of the document
async fn title(&self) -> &str {
&self.meta.title
}
/// Date and time this document was written or created
async fn date(&self) -> &Option<Date> {
&self.meta.date
}
/// When the document was bookmarked by the current user, if it was.
async fn bookmarked_on(
&self,
context: &async_graphql::Context<'_>,
) -> FieldResult<Option<Date>> {
if let Some(user) = context.data_opt::<UserInfo>() {
Ok(context
.data::<DataLoader<Database>>()?
.loader()
.get_document_bookmarked_on(&self.meta.id.0, &user.id)
.await?)
} else {
Ok(None)
}
}
/// The original source(s) of this document, the most important first.
async fn sources(&self) -> &[SourceAttribution] {
&self.meta.sources
}
/// Where the source document came from, maybe the name of a collection
async fn collection(&self) -> Option<DocumentCollection> {
self.meta
.collection
.as_ref()
.map(|name| DocumentCollection::from_name(name.to_owned()))
}
/// The genre of the document, used to group similar ones
async fn genre(&self) -> &Option<String> {
&self.meta.genre
}
/// Images of each source document page, in order
async fn page_images(&self) -> &Option<IiifImages> {
&self.meta.page_images
}
/// The people involved in producing this document, including the original
/// author, translators, and annotators
async fn contributors(
&self,
context: &async_graphql::Context<'_>,
) -> FieldResult<Vec<Contributor>> {
Ok(context
.data::<DataLoader<Database>>()?
.load_one(crate::ContributorsForDocument(self.meta.id.0))
.await?
.unwrap_or_default())
}
/// Is this document a reference source (unstructured list of words)?
/// Otherwise, it is considered a structured document with a translation.
async fn is_reference(&self) -> bool {
self.meta.is_reference
}
/// The audio recording resource for this entire document
async fn audio_recording(&self) -> &Option<AudioSlice> {
// TODO: Allow for multiple audio sources
&self.meta.audio_recording
}
/// Arbitrary number used for manually ordering documents in a collection.
/// For collections without manual ordering, use zero here.
async fn order_index(&self) -> i64 {
self.meta.order_index
}
/// URL-ready slug for this document, generated from the title
async fn slug(&self) -> String {
slug::slugify(&self.meta.short_name)
}
/// Segments of the document paired with their respective rough translations
async fn translated_pages(
&self,
context: &async_graphql::Context<'_>,
) -> FieldResult<Option<Vec<DocumentPage>>> {
Ok(context
.data::<DataLoader<Database>>()?
.load_one(PagesInDocument(self.meta.id.0))
.await?)
}
/// All the words contained in this document, dropping structural formatting
/// like line and page breaks.
async fn forms(
&self,
context: &async_graphql::Context<'_>,
start: Option<i64>,
end: Option<i64>,
) -> FieldResult<Vec<AnnotatedForm>> {
Ok(context
.data::<DataLoader<Database>>()?
.loader()
.words_in_document(self.meta.id, start, end)
.await?
.collect())
}
async fn form_count(&self, context: &async_graphql::Context<'_>) -> FieldResult<i64> {
Ok(context
.data::<DataLoader<Database>>()?
.loader()
.count_words_in_document(self.meta.id)
.await?)
}
/// All words in the document that have unanalyzed or unfamiliar parts.
/// These words need to be corrected or reviewed further.
async fn unresolved_forms(
&self,
context: &async_graphql::Context<'_>,
) -> FieldResult<Vec<AnnotatedForm>> {
let forms = context
.data::<DataLoader<Database>>()?
.loader()
.words_in_document(self.meta.id, None, None)
.await?;
Ok(forms.filter(AnnotatedForm::is_unresolved).collect())
}
/// Collection chapters that contain this document.
async fn chapters(
&self,
context: &async_graphql::Context<'_>,
) -> FieldResult<Option<Vec<crate::CollectionChapter>>> {
Ok(context
.data::<DataLoader<Database>>()?
.loader()
.chapters_by_document(self.meta.short_name.clone())
.await?)
}
}
/// Key to retrieve the pages of a document given a document ID
// Wraps the document's UUID; `AnnotatedDoc::translated_pages` builds one from
// `self.meta.id.0` and hands it to the `DataLoader`.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct PagesInDocument(pub Uuid);
/// One page of an [`AnnotatedDoc`]
#[derive(Clone)]
pub struct DocumentPage {
/// Database ID
// Wrapped in `ParagraphsInPage` to load this page's paragraphs.
pub id: Uuid,
/// One-indexed page number
// NOTE(review): stored as a `String` rather than an integer; confirm whether
// non-numeric page labels are expected.
pub page_number: String,
/// Resource of the image of this page
// `None` when the page has no associated image.
pub image: Option<PageImage>,
}
// GraphQL resolvers for a single page. The `///` lines double as schema
// descriptions, so their wording is kept as-is.
#[async_graphql::Object]
impl DocumentPage {
    /// One-indexed page number
    async fn page_number(&self) -> &str {
        self.page_number.as_str()
    }

    /// Scan of this page as a IIIF resource, if there is one
    async fn image(&self) -> &Option<PageImage> {
        let Self { image, .. } = self;
        image
    }

    /// Contents of this page as a list of paragraphs
    async fn paragraphs(
        &self,
        context: &async_graphql::Context<'_>,
    ) -> FieldResult<Vec<DocumentParagraph>> {
        let loader = context.data::<DataLoader<Database>>()?;
        let paragraphs = loader.load_one(ParagraphsInPage(self.id)).await?;
        // A page with nothing loaded for it is reported as an empty list.
        Ok(paragraphs.unwrap_or_default())
    }
}
/// Page ID meant for retrieving all paragraphs within.
// `DocumentPage::paragraphs` wraps the page's `id` in this key for the `DataLoader`.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct ParagraphsInPage(pub Uuid);
/// One paragraph within a [`DocumentPage`]
// `#[graphql(complex)]` pairs this derive with the `#[async_graphql::ComplexObject]`
// impl in this file, which adds the `source` and `comments` resolvers.
#[derive(async_graphql::SimpleObject, Clone)]
#[graphql(complex)]
pub struct DocumentParagraph {
/// Unique identifier for this paragraph
// Also the key used to load the paragraph's words and comments.
pub id: Uuid,
/// English translation of the whole paragraph
pub translation: String,
/// 1-indexed position of this paragraph in a document
pub index: i64,
}
/// A paragraph in an annotated document that can be edited.
#[derive(async_graphql::InputObject)]
pub struct ParagraphUpdate {
    /// Unique identifier of the paragraph
    pub id: Uuid,
    /// English translation of the paragraph
    // `MaybeUndefined` lets the input distinguish an omitted field from an
    // explicit `null` and from a provided value.
    pub translation: MaybeUndefined<String>,
}
/// Update the contributor attribution for a document
#[derive(async_graphql::InputObject)]
pub struct UpdateContributorAttribution {
// Presumably the document whose attribution is being changed; confirm against the mutation.
pub document_id: Uuid,
// Presumably the contributor being attributed; confirm against the mutation.
pub contributor_id: Uuid,
// Free-form `String` here; any validation of role names is not visible in this file.
pub contribution_role: String,
}
/// Delete a contributor attribution for a document based on the two ids
#[derive(async_graphql::InputObject)]
pub struct DeleteContributorAttribution {
// First of the two ids: the document side of the attribution.
pub document_id: Uuid,
// Second of the two ids: the contributor side of the attribution.
pub contributor_id: Uuid,
}
/// Used for updating document metadata.
/// All fields except id are optional.
#[derive(async_graphql::InputObject)]
pub struct DocumentMetadataUpdate {
// Required: the only field not wrapped in `MaybeUndefined`.
pub id: Uuid,
// Optional new title.
pub title: MaybeUndefined<String>,
// Optional new date, supplied as a `DateInput`.
// NOTE(review): presumably maps onto the document's `date`; confirm in the update handler.
pub written_at: MaybeUndefined<DateInput>,
}
// Resolvers that need the database, added on top of the derived simple fields.
// The `///` lines double as schema descriptions, so their wording is kept as-is.
#[async_graphql::ComplexObject]
impl DocumentParagraph {
    /// Source text of the paragraph broken down into words
    async fn source(&self, context: &async_graphql::Context<'_>) -> FieldResult<Vec<AnnotatedSeg>> {
        let loader = context.data::<DataLoader<Database>>()?;
        let segments = loader.load_one(WordsInParagraph(self.id)).await?;
        // A paragraph with nothing loaded for it is reported as empty.
        Ok(segments.unwrap_or_default())
    }

    /// Get comments on this paragraph
    async fn comments(&self, context: &async_graphql::Context<'_>) -> FieldResult<Vec<Comment>> {
        let database = context.data::<DataLoader<Database>>()?.loader();
        // Comments are keyed by parent ID plus the kind of parent.
        let parent_type = crate::comment::CommentParentType::Paragraph;
        let comments = database
            .comments_by_parent(&self.id, &parent_type)
            .await?;
        Ok(comments)
    }
}
/// Key to query the words within a paragraph given its database ID
// `DocumentParagraph::source` wraps the paragraph's `id` in this key for the `DataLoader`.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct WordsInParagraph(pub Uuid);
/// The kind of a document in terms of what body it lives within. A reference
/// document is a dictionary or grammar for example, while a corpus document
/// might be a letter, journal, or notice.
// Not referenced by any other item visible in this file; `DocumentMetadata`
// records the same distinction as the `is_reference` boolean.
#[derive(async_graphql::Enum, Clone, Copy, PartialEq, Eq)]
pub enum DocumentType {
/// Reference document, like a dictionary or grammar
Reference,
/// Corpus text: a letter, journal, book, story, meeting minutes, etc.
Corpus,
}
/// One page of a document containing one or more paragraphs
// Built by `AnnotatedDoc::new`, one per input page. Because that constructor
// drops the very first paragraph of the document, the first page can end up
// with an empty list here.
#[derive(async_graphql::SimpleObject, Serialize, Deserialize, Clone)]
pub struct TranslatedPage {
/// The paragraphs of content that make up this single page
pub paragraphs: Vec<TranslatedSection>,
}
/// A single document image from a IIIF source
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PageImage {
/// Database ID of the image source
// Resolved to an `ImageSource` through the `DataLoader` when the URL is needed.
pub source_id: ImageSourceId,
/// Remote IIIF OID of the image
// Appended to the source's base URL, separated by `/`, to form the image URL.
pub oid: String,
}
// GraphQL resolvers for a page image. The `///` lines double as schema
// descriptions, so their wording is kept as-is.
#[async_graphql::Object]
impl PageImage {
    /// The IIIF source this page image comes from
    pub async fn source(
        &self,
        context: &async_graphql::Context<'_>,
    ) -> async_graphql::FieldResult<ImageSource> {
        let loader = context.data::<DataLoader<Database>>()?;
        let found = loader.load_one(self.source_id.clone()).await?;
        // An unknown source ID is an error rather than a null field.
        Ok(found.ok_or_else(|| anyhow::format_err!("Image source not found"))?)
    }

    /// The full IIIF url for this image resource
    pub async fn url(
        &self,
        context: &async_graphql::Context<'_>,
    ) -> async_graphql::FieldResult<String> {
        let source = self.source(context).await?;
        // Base URL and OID joined by a single slash.
        Ok([source.url.as_str(), self.oid.as_str()].join("/"))
    }
}
/// One paragraph within a document with source text and overall English translation.
#[derive(async_graphql::SimpleObject, Serialize, Deserialize, Clone)]
pub struct TranslatedSection {
/// Translation of this portion of the source text.
// `None` when `AnnotatedDoc::new` finds no translation block at this paragraph's index.
pub translation: Option<String>,
/// Source text from the original document.
pub source: Vec<AnnotatedSeg>,
}
// Ideal structure:
// documents: [{ meta, pages: [{ lines: [{ index, words }] }] }]
// Basic to start: [{meta, lines: [{ index, words }]}]
/// Element within a spreadsheet before being transformed into a full document.
// `#[serde(tag = "type")]` selects serde's internally tagged representation:
// the variant name is stored in a `type` field alongside the variant's own fields.
#[derive(Debug, async_graphql::Union, Serialize, Deserialize, Clone)]
#[serde(tag = "type")]
pub enum AnnotatedSeg {
/// A single annotated word
Word(AnnotatedForm),
/// The beginning of a new line
LineBreak(LineBreak),
// Page breaks are currently disabled as a segment kind.
// PageBreak(PageBreak),
}
impl AnnotatedSeg {
/// If this segment is a word, return the inner [`AnnotatedForm`] otherwise `None`.
pub fn form(&self) -> Option<&AnnotatedForm> {
use AnnotatedSeg::*;
match self {
Word(w) => Some(w),
LineBreak(_) => None,
// PageBreak(_) => None,
}
}
}
/// Start of a new line
// Carried inside `AnnotatedSeg::LineBreak`.
#[derive(Debug, async_graphql::SimpleObject, Serialize, Deserialize, Clone)]
pub struct LineBreak {
/// Index of this line break within the document. i.e. Indicates the start
/// of line X.
pub index: i32,
}
/// Start of a new page
// Not currently part of `AnnotatedSeg`: its `PageBreak` variant is commented out.
#[derive(Debug, async_graphql::SimpleObject, Serialize, Deserialize, Clone)]
pub struct PageBreak {
/// Index of this page break within the document. i.e. Indicates the start
/// of page X.
pub index: i32,
}
/// All the metadata associated with one particular document.
/// TODO Make more of these fields on-demand.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DocumentMetadata {
/// Database ID
pub id: DocumentId,
/// Official short identifier.
// Also the input for the document's URL slug and the key for chapter lookups.
pub short_name: String,
/// Full title of the document.
pub title: String,
// Disabled field, kept for reference. Its description is a plain comment so
// that it does not attach to `sources` below:
// Further details about this particular document.
// pub details: String,
#[serde(default)]
/// The original source(s) of this document, the most important first.
pub sources: Vec<SourceAttribution>,
/// Where the source document came from, maybe the name of a collection.
pub collection: Option<String>,
/// The genre this document is. TODO Evaluate whether we need this.
pub genre: Option<String>,
#[serde(default)]
/// The people involved in collecting, translating, annotating.
// NOTE(review): the GraphQL `contributors` resolver on `AnnotatedDoc` loads
// contributors through the `DataLoader` instead of reading this field.
pub contributors: Vec<Contributor>,
/// Rough translation of the document, broken down by paragraph.
// `#[serde(skip)]`: never serialized, and `None` after deserialization.
// `AnnotatedDoc::new` panics when this is `None`.
#[serde(skip)]
pub translation: Option<Translation>,
/// URL for an image of the original physical document.
// NOTE(review): the type is a set of IIIF image IDs plus a source reference,
// not a single URL; the description above looks stale.
#[serde(default)]
pub page_images: Option<IiifImages>,
/// The date this document was produced (or `None` if unknown)
pub date: Option<Date>,
/// Whether this document is a reference, therefore just a list of forms.
pub is_reference: bool,
/// Audio recording of this document, if one exists
#[serde(default)]
pub audio_recording: Option<AudioSlice>,
#[serde(default)]
/// Arbitrary number used for manually ordering documents in a collection.
/// For collections without manual ordering, use zero here.
pub order_index: i64,
}
/// Database ID for one document
// Newtype over `Uuid`. `Copy` lets resolvers such as `AnnotatedDoc::id` return
// it by value, and `async_graphql::NewType` makes it usable as a GraphQL type.
#[derive(
Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, Debug, async_graphql::NewType, Default,
)]
pub struct DocumentId(pub Uuid);
/// Database ID for an image source
// Passed directly to `DataLoader::load_one` by `PageImage::source` and
// `IiifImages::source`, hence the `Clone`, `Eq` and `Hash` derives.
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ImageSourceId(pub Uuid);
/// A IIIF server we use as an image source
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ImageSource {
/// Database ID for this source
// Not exposed by the GraphQL impl in this file, which only resolves `url`.
pub id: ImageSourceId,
/// Base URL for the IIIF server
// Image URLs are built as `{url}/{oid}`, so a trailing slash here would
// produce a double slash.
pub url: String,
}
// GraphQL view of an image source; only the base URL is exposed.
#[async_graphql::Object]
impl ImageSource {
    /// Base URL for the IIIF server
    async fn url(&self) -> &str {
        self.url.as_str()
    }
}
/// Collection of images coming from a IIIF source. Generally used to represent
/// the scans of multi-page manuscripts sourced from libraries/archives.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct IiifImages {
/// Database ID for the image source
// Resolved to an `ImageSource` through the `DataLoader` when URLs are requested.
pub source: ImageSourceId,
/// Remote IIIF OIDs for the images
// Each ID is appended to the source's base URL, in this order, to build `urls`.
pub ids: Vec<String>,
}
impl IiifImages {
    /// How many image IDs this collection holds.
    pub fn count(&self) -> usize {
        let Self { ids, .. } = self;
        ids.len()
    }
}
// GraphQL resolvers for an image collection. The `///` lines double as schema
// descriptions, so their wording is kept as-is.
#[async_graphql::Object]
impl IiifImages {
    /// Information about the data source for this set of images
    pub async fn source(
        &self,
        context: &async_graphql::Context<'_>,
    ) -> async_graphql::FieldResult<ImageSource> {
        let loader = context.data::<DataLoader<Database>>()?;
        let found = loader.load_one(self.source.clone()).await?;
        // An unknown source ID is an error rather than a null field.
        Ok(found.ok_or_else(|| anyhow::format_err!("Image source not found"))?)
    }

    /// List of urls for all the images in this collection
    async fn urls(
        &self,
        context: &async_graphql::Context<'_>,
    ) -> async_graphql::FieldResult<Vec<String>> {
        // The source is resolved once and its base URL reused for every image.
        let source = self.source(context).await?;
        let mut urls = Vec::with_capacity(self.ids.len());
        for id in &self.ids {
            urls.push(format!("{}/{}", source.url, id));
        }
        Ok(urls)
    }
}
/// Reference to a document collection
#[derive(Clone, Serialize, Deserialize)]
pub struct DocumentCollection {
/// General title of the collection
// Exposed to GraphQL under the field name `name`.
pub title: String,
/// Unique slug used to generate URL paths
// `from_name` derives this from the title with `slug::slugify`; values built
// another way (e.g. deserialized) are taken as given.
pub slug: String,
}
impl DocumentCollection {
    /// Build a collection reference from its title alone; the slug is
    /// computed from that title.
    pub fn from_name(name: String) -> Self {
        let slug = slug::slugify(name.as_str());
        Self { title: name, slug }
    }
}
// GraphQL resolvers for a collection reference. The `///` lines double as
// schema descriptions, so their wording is kept as-is.
#[async_graphql::Object]
impl DocumentCollection {
    /// Full name of this collection
    async fn name(&self) -> &str {
        self.title.as_str()
    }

    /// URL-ready slug for this collection, generated from the name
    async fn slug(&self) -> String {
        // NOTE(review): this runs the crate-level `slugify` over the stored
        // slug, which `from_name` already produced with `slug::slugify`.
        // Confirm the second pass is intended.
        slugify(&self.slug)
    }

    /// All documents that are part of this collection
    /// TODO Try to unify this return type into AnnotatedDoc
    /// This probably requires adding a document_ids field so that we can just
    /// pass that to the dataloader below.
    async fn documents(
        &self,
        context: &async_graphql::Context<'_>,
    ) -> async_graphql::FieldResult<Vec<DocumentReference>> {
        let database = context.data::<DataLoader<Database>>()?.loader();
        // NOTE(review): the first argument is always the empty string; its
        // meaning is defined by `documents_in_collection`, not visible here.
        let documents = database.documents_in_collection("", &self.slug).await?;
        Ok(documents)
    }
}
/// Reference to a document with a limited subset of fields, namely no contents
/// of the document.
// `#[graphql(complex)]` pairs this derive with the `#[async_graphql::ComplexObject]`
// impl in this file, which adds the computed `slug` field.
#[derive(Clone, async_graphql::SimpleObject)]
#[graphql(complex)]
pub struct DocumentReference {
/// Database ID for the document
// A bare `Uuid` here, unlike `DocumentMetadata::id`, which uses the `DocumentId` wrapper.
pub id: Uuid,
/// Unique short name
pub short_name: String,
/// Long title of the document
pub title: String,
/// Date the document was produced (or `None` if unknown)
pub date: Option<Date>,
/// Index of the document within its group, used purely for ordering
pub order_index: i64,
}
// Computed GraphQL fields layered on top of the derived simple fields.
#[async_graphql::ComplexObject]
impl DocumentReference {
    /// URL slug for this document
    pub async fn slug(&self) -> String {
        // Derived from the short name on every call, not stored.
        let short_name = self.short_name.as_str();
        slug::slugify(short_name)
    }
}