diff --git a/README.md b/README.md
new file mode 100644
index 0000000..53dc0be
--- /dev/null
+++ b/README.md
@@ -0,0 +1,63 @@
+# oimdp: OpenITI mARkdown Parser
+
+This Python library parses an [OpenITI mARkdown](https://alraqmiyyat.github.io/mARkdown/) document and returns a Python class
+representation of the document structures.
+
+## Usage
+
+```py
+import oimdp
+
+md_file = open("mARkdownfile", "r")
+text = md_file.read()
+md_file.close()
+parsed = oimdp.parse(text)
+```
+
+## Parsed structure
+
+Please see [the docs](https://openiti.github.io/oimdp/), but here are some highlights:
+
+### Document API
+
+`content`: a list of content structures
+
+`get_clean_text()`: get the text stripped of markup
+
+### Content structures
+
+`Content` classes contain an original value from the document and some extracted content, such as a text string or a specific value.
+
+Most other structures are listed in sequence (e.g. a `Paragraph` is followed by a `Line`).
+
+`Line` objects and other line-level structures are divided into `PhrasePart` objects.
+
+`PhrasePart` objects are phrase-level tags.
+
+## Develop
+
+Set up a virtual environment with `venv`:
+
+```sh
+python3 -m venv .env
+```
+
+Activate the virtual environment:
+
+```sh
+source .env/bin/activate
+```
+
+Install the package:
+
+```sh
+python setup.py install
+```
+
+## Tests
+
+With the environment activated:
+
+```sh
+python tests/test.py
+```
\ No newline at end of file
diff --git a/oimdp/parser.py b/oimdp/parser.py
new file mode 100644
index 0000000..4b0603e
--- /dev/null
+++ b/oimdp/parser.py
@@ -0,0 +1,328 @@
+import sys
+import re
+from .structures import Age, Date, Document, Hemistich, Hukm, Isnad, Matn, NamedEntity, OpenTagAuto, OpenTagUser, PageNumber, Paragraph, Line, RouteDist, RouteFrom, RouteTowa, Verse, Milestone
+from .structures import SectionHeader, Editorial, DictionaryUnit, BioOrEvent
+from .structures import DoxographicalItem, MorphologicalPattern, TextPart
+from .structures import AdministrativeRegion, RouteOrDistance, Riwayat
+from . import tags as t
+
+PAGE_PATTERN = re.compile(r"PageV(\d+)P(\d+)")
+OPEN_TAG_CUSTOM_PATTERN = r"@[^@]+?@[^_@]+?_[^_@]+?(?:_[^_@]+?)?@"
+OPEN_TAG_CUSTOM_PATTERN_GROUPED = re.compile(
+    r"@([^@]+?)@([^_@]+?)_([^_@]+?)(_([^_@]+?))?@"
+)
+OPEN_TAG_AUTO_PATTERN = r"@[A-Z]{3}@[A-Z]{3,}@[A-Za-z]+@(?:-@[0tf][ftalmr]@)?"
+OPEN_TAG_AUTO_PATTERN_GROUPED = re.compile(
+    r"@([A-Z]{3})@([A-Z]{3,})@([A-Za-z]+)@(-@([0tf][ftalmr])@)?"
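+    # Illustrative example: a token like "@RES@TYPE@Category@-@fr@" matches this
+    # pattern, with resp "RES", t_type "TYPE", category "Category" and the
+    # optional review flag "fr" captured by the groups above.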
+)
+YEAR_PATTERN = [rf"{t.YEAR_AGE}\d{{1,4}}", rf"{t.YEAR_DEATH}\d{{1,4}}", rf"{t.YEAR_BIRTH}\d{{1,4}}", rf"{t.YEAR_OTHER}\d{{1,4}}"]
+TOP_PATTERN = [rf"{t.TOP_FULL}\d{{1,2}}", rf"{t.TOP}\d{{1,2}}"]
+PER_PATTERN = [rf"{t.PER_FULL}\d{{1,2}}", rf"{t.PER}\d{{1,2}}"]
+SOC_PATTERN = [rf"{t.SOC_FULL}\d{{1,2}}", rf"{t.SOC}\d{{1,2}}"]
+NAMED_ENTITIES_PATTERN = [*YEAR_PATTERN, *TOP_PATTERN, *PER_PATTERN, rf"{t.SRC}\d{{1,2}}", *SOC_PATTERN]
+
+
+def parse_tags(s: str):
+    return s
+
+
+def remove_phrase_lv_tags(s: str):
+    text_only = s
+    for tag in t.PHRASE_LV_TAGS:
+        text_only = text_only.replace(tag, '')
+    for tag in NAMED_ENTITIES_PATTERN:
+        text_only = re.compile(tag).sub('', text_only)
+    # Open tags
+    text_only = OPEN_TAG_CUSTOM_PATTERN_GROUPED.sub('', text_only)
+    text_only = OPEN_TAG_AUTO_PATTERN_GROUPED.sub('', text_only)
+    text_only = PAGE_PATTERN.sub('', text_only)
+    return text_only
+
+
+def parse_line(tagged_il: str, index: int, obj=Line, first_token=None):
+    """Parse a tagged line into LineParts by splitting it on tags and patterns."""
+    # remove line tag
+    il = tagged_il.replace(t.LINE, '')
+
+    # get clean text
+    text_only = il
+    text_only = remove_phrase_lv_tags(text_only)
+
+    if text_only == "":
+        return None
+
+    line = obj(il, text_only)
+
+    # Split the line by tags. Make sure patterns do not include subgroups!
+    tokens = re.split(rf"(PageV\d+P\d+|{OPEN_TAG_AUTO_PATTERN}|{OPEN_TAG_CUSTOM_PATTERN}|{'|'.join([re.escape(tag) for tag in t.PHRASE_LV_TAGS])}|{'|'.join(NAMED_ENTITIES_PATTERN)})", il)
+
+    # Some structures inject a token at the beginning of a line, like a riwāyaŧ's isnād
+    if first_token:
+        line.add_part(first_token(""))
+
+    # Named entities include in their `text` property a given number of words from the following text token.
+    # This variable is used to keep track. A "word" is just a space-separated token.
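+    # For example (illustrative): a named-entity token whose trailing digits are "12"
+    # is read below as prefix 1 and extent 2, so the entity's text absorbs the next
+    # two space-separated words of the following text token.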
+ include_words = 0 + + for token in tokens: + if token == '': + continue + + opentag_match = None + opentagauto_match = None + + if token.startswith('@'): + opentag_match = OPEN_TAG_CUSTOM_PATTERN_GROUPED.match(token) + opentagauto_match = OPEN_TAG_AUTO_PATTERN_GROUPED.match(token) + + if t.PAGE in token: + m = PAGE_PATTERN.search(token) + try: + line.add_part(PageNumber(token, m.group(1), m.group(2))) + except Exception: + raise Exception( + 'Could not parse page number at line: ' + str(index+1) + ) + elif opentag_match: + line.add_part(OpenTagUser(token, + opentag_match.group(1), # user + opentag_match.group(2), # t_type + opentag_match.group(3), # t_subtype + opentag_match.group(5))) # t_subsubtype + elif opentagauto_match: + line.add_part(OpenTagAuto(token, + opentagauto_match.group(1), # resp + opentagauto_match.group(2), # t_type + opentagauto_match.group(3), # category + opentagauto_match.group(5))) # review + elif t.HEMI in token: + line.add_part(Hemistich(token)) + elif t.MILESTONE in token: + line.add_part(Milestone(token)) + elif t.MATN in token: + line.add_part(Matn(token)) + elif t.HUKM in token: + line.add_part(Hukm(token)) + elif t.ROUTE_FROM in token: + line.add_part(RouteFrom(token)) + elif t.ROUTE_TOWA in token: + line.add_part(RouteTowa(token)) + elif t.ROUTE_DIST in token: + line.add_part(RouteDist(token)) + elif t.YEAR_BIRTH in token: + line.add_part(Date(token, token.replace(t.YEAR_BIRTH, ''), 'birth')) + elif t.YEAR_DEATH in token: + line.add_part(Date(token, token.replace(t.YEAR_DEATH, ''), 'death')) + elif t.YEAR_OTHER in token: + line.add_part(Date(token, token.replace(t.YEAR_OTHER, ''), 'other')) + elif t.YEAR_AGE in token: + line.add_part(Age(token, token.replace(t.YEAR_AGE, ''))) + elif t.SRC in token: + val = token.replace(t.SRC, '') + include_words = int(val[1]) + line.add_part(NamedEntity(token, int(val[0]), include_words, "", 'src')) + elif t.SOC_FULL in token: + val = token.replace(t.SOC_FULL, '') + include_words = int(val[1]) + line.add_part(NamedEntity(token, int(val[0]), include_words, "", 'soc')) + elif t.SOC in token in token: + val = token.replace(t.SOC, '') + include_words = int(val[1]) + line.add_part(NamedEntity(token, int(val[0]), include_words, "", 'soc')) + elif t.TOP_FULL in token: + val = token.replace(t.TOP_FULL, '') + include_words = int(val[1]) + line.add_part(NamedEntity(token, int(val[0]), include_words, "", 'top')) + elif t.TOP in token: + val = token.replace(t.TOP, '') + include_words = int(val[1]) + line.add_part(NamedEntity(token, int(val[0]), include_words, "", 'top')) + elif t.PER_FULL in token: + val = token.replace(t.PER_FULL, '') + include_words = int(val[1]) + line.add_part(NamedEntity(token, int(val[0]), include_words, "", 'per')) + elif t.PER in token: + val = token.replace(t.PER, '') + include_words = int(val[1]) + line.add_part(NamedEntity(token, int(val[0]), include_words, "", 'per')) + else: + if include_words > 0: + rest = "" + words = token.strip().split() + for pos, word in enumerate(reversed(words)): # reversing split for r-t-l script + if (pos < include_words): + line.parts[-1].text = line.parts[-1].text + word + " " + else: + rest = rest + word + " " + if len(rest): + line.add_part(TextPart(rest)) + include_words = 0 + else: + line.add_part(TextPart(token)) + return line + + +def parser(text: str): + """Parses an OpenITI mARkdown file and returns a Document object""" + document = Document(text) + + # Split input text into lines + ilines = text.splitlines() + + # Magic value + magic_value = ilines[0] + + if 
magic_value.strip() != "######OpenITI#": + raise Exception( + "This does not appear to be an OpenITI mARkdown document") + sys.exit(1) + + document.set_magic_value(magic_value) + + # RE patterns + para_pattern = re.compile(r"^#($|[^#])") + bio_pattern = re.compile(rf"{re.escape(t.BIO_MAN)}[^#]") + morpho_pattern = re.compile(r"#~:([^:]+?):") + region_pattern = re.compile( + rf"({t.PROV}|{t.REG}\d) .*? {t.GEO_TYPE} .*? ({t.REG}\d|{t.STTL}) ([\w# ]+) $" + ) + + # Input lines loop + for i, il in enumerate(ilines): + + # N.B. the order of if statements matters! + # We're doing string matching and tag elements are re-used. + + # Non-machine readable metadata + if (il.startswith(t.META)): + if (il.strip() == t.METAEND): + continue + value = il.split(t.META, 1)[1].strip() + document.set_simple_metadata_field(il, value) + + # Content-level page numbers + elif (il.startswith(t.PAGE)): + pv = PAGE_PATTERN.search(il) + try: + document.add_content(PageNumber(il, pv.group(1), pv.group(2))) + except Exception: + raise Exception( + 'Could not parse page number at line: ' + str(i+1) + ) + + # Riwāyāt units + elif (il.startswith(t.RWY)): + # Set first line, skipping para marker "# $RWY$" + document.add_content(Riwayat()) + first_line = parse_line(il[7:], i, first_token=Isnad) + if first_line: + document.add_content(first_line) + + # Routes + elif (il.startswith(t.ROUTE_FROM)): + document.add_content(parse_line(il, i, RouteOrDistance)) + + # Morphological pattern + elif (morpho_pattern.search(il)): + m = morpho_pattern.search(il) + document.add_content(MorphologicalPattern(il, m.group(1))) + + # Paragraphs and lines of verse + elif (para_pattern.search(il)): + if (t.HEMI in il): + # this is a verse line, skip para marker "#" + document.add_content(parse_line(il[1:], i, Verse)) + else: + document.add_content(Paragraph()) + first_line = parse_line(il[1:], i) + if first_line: + document.add_content(first_line) + + # Lines + elif (il.startswith(t.LINE)): + document.add_content(parse_line(il, i)) + + # Editorial section + elif (il.startswith(t.EDITORIAL)): + document.add_content(Editorial(il)) + + # Section headers + elif (il.startswith(t.HEADER1)): + value = il + for tag in t.HEADERS: + value = value.replace(tag, '') + # remove other phrase level tags + value = remove_phrase_lv_tags(value) + # TODO: capture tags as PhraseParts + level = 1 + if (t.HEADER5 in il): + level = 5 + elif (t.HEADER4 in il): + level = 4 + elif (t.HEADER3 in il): + level = 3 + elif (t.HEADER2 in il): + level = 2 + + document.add_content(SectionHeader(il, value, level)) + + # Dictionary entry + elif (il.startswith(t.DIC)): + no_tag = il + for tag in t.DICTIONARIES: + no_tag = no_tag.replace(tag, '') + first_line = parse_line(no_tag, i) + dic_type = "bib" + if (t.DIC_LEX in il): + dic_type = "lex" + elif (t.DIC_NIS in il): + dic_type = "nis" + elif (t.DIC_TOP in il): + dic_type = "top" + document.add_content(DictionaryUnit(il, dic_type)) + if first_line: + document.add_content(first_line) + + # Doxographical item + elif (il.startswith(t.DOX)): + no_tag = il + for tag in t.DOXOGRAPHICAL: + no_tag = no_tag.replace(tag, '') + first_line = parse_line(no_tag, i) + dox_type = "pos" + if (t.DOX_SEC in il): + dox_type = "sec" + document.add_content(DoxographicalItem(il, dox_type)) + if first_line: + document.add_content(first_line) + + # Biographies and Events + elif (bio_pattern.search(il) or il.startswith(t.BIO) or il.startswith(t.EVENT)): + no_tag = il + for tag in t.BIOS_EVENTS: + no_tag = no_tag.replace(tag, '') + first_line = 
parse_line(no_tag, i) + be_type = "man" + # Ordered from longer to shorter string to aid matching. I.e. ### $$$ before ### $$ + if (t.LIST_NAMES_FULL in il or t.LIST_NAMES in il): + be_type = "names" + elif (t.BIO_REF_FULL in il or t.BIO_REF in il): + be_type = "ref" + elif (t.BIO_WOM_FULL in il or t.BIO_WOM in il): + be_type = "wom" + elif (t.LIST_EVENTS in il): + be_type = "events" + elif (t.EVENT in il): + be_type = "event" + document.add_content(BioOrEvent(il, be_type)) + if first_line: + document.add_content(first_line) + + # Regions + elif (region_pattern.search(il)): + document.add_content(AdministrativeRegion(il)) + + else: + continue + + return document diff --git a/oimdp/structures.py b/oimdp/structures.py new file mode 100644 index 0000000..46cb679 --- /dev/null +++ b/oimdp/structures.py @@ -0,0 +1,287 @@ +from typing import List, Literal + + +class MagicValue: + """Magic Value of OpenITI mARkdown file""" + def __init__(self, orig: str): + self.orig = orig + self.value = "######OpenITI#" + + def __str__(self): + return self.value + + +class SimpleMetadataField: + """A non-machine readable metadata field""" + def __init__(self, orig: str, value: str): + self.orig = orig + self.value = value + + def __str__(self): + return self.value + + +class LinePart: + """A line-level tag""" + def __init__(self, orig: str): + self.orig = orig + + def __str__(self): + return self.orig + + +class TextPart(LinePart): + """Phrase-level text""" + def __init__(self, orig: str): + self.orig = orig + self.text = orig + + def __str__(self): + return self.text + + +class Date(LinePart): + """A date in running text""" + def __init__(self, orig: str, value: str, date_type: str): + self.orig = orig + self.value = value + self.date_type: Literal["birth", "death", "age", "other"] = date_type + + def __str__(self): + return self.orig + +class Age(LinePart): + """A number indicating age in running text""" + def __init__(self, orig: str, value: str): + self.orig = orig + self.value = value + + def __str__(self): + return self.orig + +class NamedEntity(LinePart): + """A named entity""" + def __init__(self, orig: str, prefix: int, extent: int, text: str, ne_type: str): + self.orig = orig + self.text = text + self.prefix = prefix + self.extent = extent + self.ne_type: Literal["top", "per", "soc", "src"] = ne_type + + def __str__(self): + return self.text + + +class OpenTagUser(LinePart): + """A custom tag added by a specific user""" + def __init__(self, orig: str, user: str, t_type: str, t_subtype: str, t_subsubtype: str): + self.orig = orig + self.user = user + self.t_type = t_type + self.t_subtype = t_subtype + self.t_subsubtype = t_subsubtype + + def __str__(self): + return self.value + + +class OpenTagAuto(LinePart): + """A custom tag added automatically""" + def __init__(self, orig: str, resp: str, t_type: str, category: str, review: str): + self.orig = orig + self.resp = resp + self.t_type = t_type + self.category = category + self.review = review + + def __str__(self): + return self.value + + +class Milestone(LinePart): + """Milestone typically used for splitting text in 300-word blocks""" + def __str__(self): + return "" + + +class Isnad(LinePart): + """An isnād part of a riwāyaŧ unit""" + + +class Matn(LinePart): + """A matn part of a riwāyaŧ unit""" + + +class Hukm(LinePart): + """A ḥukm part of a riwāyaŧ unit""" + + +class Line: + """A line of text that may contain parts""" + def __init__(self, orig: str, text_only: str, parts: List[LinePart] = None): + self.orig = orig + self.text_only = text_only + if 
(parts is None): + self.parts = [] + else: + self.parts = parts + + def add_part(self, part: LinePart): + self.parts.append(part) + + def __str__(self): + return "".join([str(p) for p in self.parts]) + + +class PageNumber(): + """A page and volume number. Can be Content or LinePart object""" + def __init__(self, orig: str, vol: str, page: str): + self.orig = orig + self.page = page + self.volume = vol + + def __str__(self): + return f"Vol. {self.volume}, p. {self.page}" + + +class Content: + """A content structure""" + def __init__(self, orig: str): + self.orig = orig + + def __str__(self): + return self.orig + + +class Verse(Line): + """A line of poetry""" + +class Hemistich(LinePart): + """Tags the beginning of a hemistic in a verse""" + + +class Paragraph(Content): + """Marks the beginning of a paragraph""" + def __init__(self, orig = "#"): + self.orig = orig + + def __str__(self): + return "" + +class SectionHeader(Content): + """A section header""" + def __init__(self, orig: str, value: str, level: int): + self.orig = orig + self.value = value + self.level = level + + def __str__(self): + return self.value + + +class Editorial(Content): + """Marks the beginning of an editorial section""" + def __init__(self, orig: str): + self.orig = orig + + def __str__(self): + return "" + + +class DictionaryUnit(Content): + """Marks a dictionary unit""" + def __init__(self, orig: str, dic_type: str): + self.orig = orig + self.dic_type: Literal["nit", "top", "lex", "bib"] = dic_type + + def __str__(self): + return "" + + +class BioOrEvent(Content): + """Marks a biography or an event""" + def __init__(self, orig: str, be_type: str): + self.orig = orig + self.be_type: Literal["man", "wom", "ref", "names", "event", "events"] = be_type + + def __str__(self): + return "" + + +class DoxographicalItem(Content): + """Marks a doxographical section""" + def __init__(self, orig: str, dox_type: str): + self.orig = orig + self.dox_type: Literal["pos", "sec"] = dox_type + + def __str__(self): + return self.value + + +class MorphologicalPattern(Content): + """A milestone to tag passages that can be categorized thematically.""" + def __init__(self, orig: str, category: str): + self.orig = orig + self.category = category + + def __str__(self): + return "" + + +class AdministrativeRegion(Content): + """An administrative region""" + # TODO + + def __str__(self): + return "" + + +class RouteOrDistance(Line): + """A route or distance""" + + +class RouteFrom(LinePart): + """Origin of a Route""" + + +class RouteTowa(LinePart): + """Destination of a Route""" + + +class RouteDist(LinePart): + """Distance of a Route""" + + +class Riwayat(Paragraph): + """Riwāyāt unit""" + + +class Document: + """The OpenITI mARkdown document""" + def __init__(self, text): + self.orig_text = text + self.simple_metadata = [] + self.content = [] + + def set_magic_value(self, orig: str): + self.magic_value = MagicValue(orig) + + def set_simple_metadata_field(self, orig: str, value: str): + self.simple_metadata.append(SimpleMetadataField(orig, value)) + + def add_content(self, content: Content): + self.content.append(content) + + def get_clean_text(self, includeMetadata: bool = False): + text = "" + if (includeMetadata): + text += "Metadata:\n" + text += "\n".join([str(md) for md in self.simple_metadata]) + text += "\n\n" + + text += "\n".join([str(c) for c in self.content]) + + return text + + def __str__(self): + return self.orig_text diff --git a/tests/batch.py b/tests/batch.py new file mode 100644 index 0000000..1298482 --- /dev/null +++ 
b/tests/batch.py @@ -0,0 +1,33 @@ +import sys +import os +import urllib.request +# import traceback +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) +import oimdp + + +if __name__ == "__main__": + + response = urllib.request.urlopen( + "https://raw.githubusercontent.com/OpenITI/RELEASE/master/OpenITI_metatdata_2019_1_1" + ) + release = response.read() + release = release.decode('utf-8') + + for line in release.split("\n"): + url = line.split('\t')[7] + if (url.endswith('mARkdown') or url.endswith('completed')): + # get file from GitHub + print("Parsing " + url) + try: + response = urllib.request.urlopen(url) + data = response.read() + text = data.decode('utf-8') + try: + oimdp.parse(text) + except Exception as identifier: + # print(traceback.format_exc()) + print("\tERR: ", identifier) + except Exception as identifier: + print("\t", identifier) diff --git a/tests/test.py b/tests/test.py new file mode 100644 index 0000000..ccf89bb --- /dev/null +++ b/tests/test.py @@ -0,0 +1,263 @@ +import sys +import os +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) +import unittest +import oimdp +from oimdp.structures import Age, BioOrEvent, Date, DictionaryUnit, Document, DoxographicalItem, Editorial, Hemistich, Hukm, Isnad, Line, Matn, Milestone, MorphologicalPattern, NamedEntity, OpenTagAuto, OpenTagUser, PageNumber, Paragraph, Riwayat, RouteDist, RouteFrom, RouteOrDistance, RouteTowa, SectionHeader, TextPart, Verse + + +class TestStringMethods(unittest.TestCase): + + def __init__(self, *args, **kwargs): + super(TestStringMethods, self).__init__(*args, **kwargs) + root = os.path.dirname(__file__) + filepath = os.path.join( + root, "test.md" + ) + test_file = open(filepath, "r") + self.text = test_file.read() + test_file.close() + self.parsed = oimdp.parse(self.text) + + def generic_check(self, datatype, location: int, type: str, property: str = ""): + content = self.parsed.content[location] + self.assertTrue(isinstance(content, datatype)) + if (len(property) > 0): + self.assertEqual(getattr(content, property), type) + + def test_magic(self): + self.assertEqual(str(self.parsed.magic_value), "######OpenITI#") + + def test_meta(self): + self.assertEqual(str(self.parsed.simple_metadata[1]), + "000.SortField :: Shamela_0023833") + self.assertEqual(str(self.parsed.simple_metadata[-1]), + "999.MiscINFO :: NODATA") + + def test_document(self): + self.assertTrue(isinstance(self.parsed, Document)) + + def test_page(self): + self.assertTrue(isinstance(self.parsed.content[1].parts[1], + PageNumber)) + self.assertEqual(str(self.parsed.content[1].parts[1]), + "Vol. 00, p. 
000") + + def test_bio_or_event(self): + def check(location: int, type: str): + self.generic_check(BioOrEvent, location, type, "be_type") + + check(2, "man") + self.assertEqual(str(self.parsed.content[3].parts[0]), + " أبو عمرو ابن العلاء واسمه") + check(8, "man") + self.assertEqual(str(self.parsed.content[9].parts[0]), + " أبو عمرو ابن العلاء واسمه") + check(14, "wom") + self.assertEqual(str(self.parsed.content[15].parts[0]), + " 1729 - صمعة بنت أحمد بن محمد بن عبيد الله الرئيس النيسابورية من ولد عثمان بن") + check(17, "wom") + self.assertEqual(str(self.parsed.content[18].parts[0]), + " 1729 - صمعة بنت أحمد بن محمد بن عبيد الله الرئيس النيسابورية من ولد عثمان بن") + check(20, "ref") + self.assertEqual(str(self.parsed.content[21].parts[0]), + " [a cross-reference, for both men and women]") + check(23, "ref") + self.assertEqual(str(self.parsed.content[24].parts[0]), + " [a cross-reference, for both men and women]") + check(26, "names") + self.assertEqual(str(self.parsed.content[27].parts[0]), + " -وفيها ولد: (@)(@@) المحدث عفيف ") + check(29, "names") + self.assertEqual(str(self.parsed.content[30].parts[0]), + " -وفيها ولد: (@)(@@) المحدث عفيف ") + check(32, "events") + check(34, "event") + check(49, "man") + + def test_dictionary_unit(self): + def check(location: int, type: str): + self.generic_check(DictionaryUnit, location, type, "dic_type") + + check(36, "nis") + check(38, "top") + check(40, "lex") + check(42, "bib") + + def test_doxographical(self): + def check(location: int, type: str): + self.generic_check(DoxographicalItem, location, type, "dox_type") + + check(44, "pos") + check(46, "sec") + + def test_editorial(self): + self.assertTrue(isinstance(self.parsed.content[48], Editorial)) + + def test_morphological(self): + self.assertTrue(isinstance(self.parsed.content[50], MorphologicalPattern)) + self.assertTrue(self.parsed.content[50].category, "onomastic") + + def test_paragraph(self): + self.assertTrue(isinstance(self.parsed.content[51], Paragraph)) + + def test_line(self): + self.assertTrue(isinstance(self.parsed.content[52], Line)) + self.assertTrue(isinstance(self.parsed.content[53], Line)) + ## Check line parts on 53 + + def test_milestone(self): + self.assertTrue(isinstance(self.parsed.content[67], Line)) + self.assertTrue(isinstance(self.parsed.content[67].parts[1], Milestone)) + + def test_named_entities(self): + self.assertTrue(isinstance(self.parsed.content[53].parts[1], Date)) + self.assertEqual(self.parsed.content[53].parts[1].date_type, "death") + + self.assertTrue(isinstance(self.parsed.content[68].parts[1], Date)) + self.assertEqual(self.parsed.content[68].parts[1].date_type, "birth") + self.assertEqual(self.parsed.content[68].parts[1].value, "597") + + self.assertTrue(isinstance(self.parsed.content[69].parts[1], Date)) + self.assertEqual(self.parsed.content[69].parts[1].date_type, "other") + self.assertEqual(self.parsed.content[69].parts[1].value, "597") + + self.assertTrue(isinstance(self.parsed.content[70].parts[1], Age)) + self.assertEqual(self.parsed.content[70].parts[1].value, "059") + + self.assertTrue(isinstance(self.parsed.content[71].parts[1], NamedEntity)) + self.assertEqual(self.parsed.content[71].parts[1].ne_type, "soc") + self.assertEqual(self.parsed.content[71].parts[1].prefix, 0) + self.assertEqual(self.parsed.content[71].parts[1].extent, 2) + self.assertEqual(self.parsed.content[71].parts[1].text, 'معمر شيخ: ') + self.assertEqual(self.parsed.content[71].parts[2].text, 'واسط.. 
1"018: نزيل: ') + + self.assertTrue(isinstance(self.parsed.content[72].parts[1], NamedEntity)) + self.assertEqual(self.parsed.content[72].parts[1].ne_type, "soc") + self.assertEqual(self.parsed.content[72].parts[1].prefix, 1) + self.assertEqual(self.parsed.content[72].parts[1].extent, 3) + + self.assertTrue(isinstance(self.parsed.content[73].parts[1], NamedEntity)) + self.assertEqual(self.parsed.content[73].parts[1].ne_type, "top") + self.assertEqual(self.parsed.content[73].parts[1].prefix, 0) + self.assertEqual(self.parsed.content[73].parts[1].extent, 2) + + self.assertTrue(isinstance(self.parsed.content[74].parts[1], NamedEntity)) + self.assertEqual(self.parsed.content[74].parts[1].ne_type, "top") + self.assertEqual(self.parsed.content[74].parts[1].prefix, 1) + self.assertEqual(self.parsed.content[74].parts[1].extent, 3) + + self.assertTrue(isinstance(self.parsed.content[75].parts[1], NamedEntity)) + self.assertEqual(self.parsed.content[75].parts[1].ne_type, "per") + self.assertEqual(self.parsed.content[75].parts[1].prefix, 0) + self.assertEqual(self.parsed.content[75].parts[1].extent, 2) + + self.assertTrue(isinstance(self.parsed.content[76].parts[1], NamedEntity)) + self.assertEqual(self.parsed.content[76].parts[1].ne_type, "per") + self.assertEqual(self.parsed.content[76].parts[1].prefix, 1) + self.assertEqual(self.parsed.content[76].parts[1].extent, 3) + + self.assertTrue(isinstance(self.parsed.content[77].parts[1], NamedEntity)) + self.assertEqual(self.parsed.content[77].parts[1].ne_type, "src") + self.assertEqual(self.parsed.content[77].parts[1].prefix, 0) + self.assertEqual(self.parsed.content[77].parts[1].extent, 3) + + def test_opentags(self): + self.assertTrue(isinstance(self.parsed.content[79].parts[1], OpenTagUser)) + self.assertEqual(self.parsed.content[79].parts[1].user, "USER") + self.assertEqual(self.parsed.content[79].parts[1].t_type, "CAT") + self.assertEqual(self.parsed.content[79].parts[1].t_subtype, "SUBCAT") + self.assertEqual(self.parsed.content[79].parts[1].t_subsubtype, "SUBSUBCAT") + + def test_opentagsauto(self): + self.assertTrue(isinstance(self.parsed.content[81].parts[1], OpenTagAuto)) + self.assertEqual(self.parsed.content[81].parts[1].resp, "RES") + self.assertEqual(self.parsed.content[81].parts[1].t_type, "TYPE") + self.assertEqual(self.parsed.content[81].parts[1].category, "Category") + self.assertEqual(self.parsed.content[81].parts[1].review, "fr") + + def test_riwayat(self): + self.assertTrue(isinstance(self.parsed.content[54], Riwayat)) + self.assertTrue(isinstance(self.parsed.content[55], Line)) + self.assertTrue(isinstance(self.parsed.content[55].parts[0], Isnad)) + self.assertTrue(isinstance(self.parsed.content[55].parts[1], TextPart)) + self.assertEqual(self.parsed.content[55].parts[1].orig, " this section contains isnād ") + + self.assertTrue(isinstance(self.parsed.content[55].parts[2], Matn)) + self.assertTrue(isinstance(self.parsed.content[55].parts[3], TextPart)) + self.assertEqual(self.parsed.content[55].parts[3].orig, " this section") + + self.assertTrue(isinstance(self.parsed.content[56], Line)) + self.assertTrue(isinstance(self.parsed.content[56].parts[0], TextPart)) + self.assertEqual(self.parsed.content[56].parts[0].orig, " contains matn ") + + self.assertTrue(isinstance(self.parsed.content[56].parts[1], Hukm)) + self.assertTrue(isinstance(self.parsed.content[56].parts[2], TextPart)) + self.assertEqual(self.parsed.content[56].parts[2].orig, " this section contains ḥukm .") + + def test_route_or_distance(self): + 
self.assertTrue(isinstance(self.parsed.content[57], RouteOrDistance)) + self.assertTrue(isinstance(self.parsed.content[57].parts[0], RouteFrom)) + self.assertTrue(isinstance(self.parsed.content[57].parts[1], TextPart)) + self.assertEqual(self.parsed.content[57].parts[1].orig, " toponym ") + + self.assertTrue(isinstance(self.parsed.content[57].parts[2], RouteTowa)) + self.assertTrue(isinstance(self.parsed.content[57].parts[3], TextPart)) + self.assertEqual(self.parsed.content[57].parts[3].orig, " toponym ") + + self.assertTrue(isinstance(self.parsed.content[57].parts[4], RouteDist)) + self.assertTrue(isinstance(self.parsed.content[57].parts[5], TextPart)) + self.assertEqual(self.parsed.content[57].parts[5].orig, " distance_as_recorded") + + def test_section_headers(self): + self.assertTrue(isinstance(self.parsed.content[58], SectionHeader)) + self.assertEqual(self.parsed.content[58].value, " ذكر سرد النسب الزكي من محمد صلى الله عليه وآله وسلم، إلى آدم عليه السلام") + self.assertEqual(self.parsed.content[58].level, 1) + + self.assertTrue(isinstance(self.parsed.content[59], SectionHeader)) + self.assertEqual(self.parsed.content[59].value, " (نهج ابن هشام في هذا الكتاب) :") + self.assertEqual(self.parsed.content[59].level, 2) + + self.assertTrue(isinstance(self.parsed.content[60], SectionHeader)) + self.assertEqual(self.parsed.content[60].value, " (نهج ابن هشام في هذا الكتاب) :") + self.assertEqual(self.parsed.content[60].level, 3) + + self.assertTrue(isinstance(self.parsed.content[61], SectionHeader)) + self.assertEqual(self.parsed.content[61].value, " (نهج ابن هشام في هذا الكتاب) :") + self.assertEqual(self.parsed.content[61].level, 4) + + self.assertTrue(isinstance(self.parsed.content[62], SectionHeader)) + self.assertEqual(self.parsed.content[62].value, " (نهج ابن هشام في هذا الكتاب) :") + self.assertEqual(self.parsed.content[62].level, 5) + + def test_verse(self): + self.assertTrue(isinstance(self.parsed.content[63], Verse)) + self.assertTrue(isinstance(self.parsed.content[63].parts[0], TextPart)) + self.assertEqual(self.parsed.content[63].parts[0].orig, " وجمع العرب تحت لواء الرسول محمد عليه الصلاة ") + + self.assertTrue(isinstance(self.parsed.content[63].parts[1], Hemistich)) + self.assertEqual(self.parsed.content[63].parts[1].orig, "%~%") + + self.assertTrue(isinstance(self.parsed.content[63].parts[2], TextPart)) + self.assertEqual(self.parsed.content[63].parts[2].orig, " والسلام، وما يضاف إلى ذلك من") + + self.assertTrue(isinstance(self.parsed.content[64], Verse)) + self.assertTrue(isinstance(self.parsed.content[64].parts[0], TextPart)) + self.assertEqual(self.parsed.content[64].parts[0].orig, " ") + self.assertTrue(isinstance(self.parsed.content[64].parts[1], Hemistich)) + self.assertEqual(self.parsed.content[64].parts[1].orig, "%~%") + self.assertTrue(isinstance(self.parsed.content[64].parts[2], TextPart)) + self.assertEqual(self.parsed.content[64].parts[2].orig, " وجمع العرب تحت لواء الرسول محمد عليه الصلاة والسلام، وما يضاف إلى ذلك من") + + self.assertTrue(isinstance(self.parsed.content[65], Verse)) + self.assertTrue(isinstance(self.parsed.content[65].parts[1], Hemistich)) + self.assertEqual(self.parsed.content[65].parts[1].orig, "%~%") + self.assertTrue(isinstance(self.parsed.content[65].parts[0], TextPart)) + self.assertEqual(self.parsed.content[65].parts[0].orig, " جمع العرب تحت لواء الرسول محمد عليه الصلاة والسلام، وما يضاف إلى ذلك من") + + # TODO: ADMINISTRATIVE REGIONS! + + +if __name__ == "__main__": + unittest.main()