Goodly · stevenelleman · Sep 18, 2017 · Oct 12, 2017 · Oct 12, 2017 · Oct 12, 2017
diff --git a/data/parse_document.py b/data/parse_document.py
@@ -97,11 +97,53 @@ def parse_article(raw_text, filename):
         raise ArticleParseError("Only found Useless tuas!",
                                 ArticleParseError.DUPLICATE_ERROR)
 
-    # Warning: brackets left over are usually bad news.
-    if '[' in clean_text or ']' in clean_text:
-        print "Unparsed brackets left in article:", article_number
-#        raise ArticleParseError("Brackets remain in clean text!",
-#                                ArticleParseError.BRACKET_WARNING)
+    #Trace highlights
+    index = 0
+    highlight_open = False
+    highlights = []
+
+    while index < len(clean_text):
+
+        #Highlights may not contain nested square brackets, so a "[" inside an open highlight raises an error.
+        if (clean_text[index] == "[" and highlight_open):
+            raise Exception("Extra [ in " + filename + " article number " + article_number + "\n")
+
+        #A right square bracket without a matching left square bracket is treated as an error.
+        elif (clean_text[index] == "]" and not highlight_open):
+            raise Exception("Extra ] in " + filename + " article number " + article_number + "\n")
+
+        #Start of highlight.
+        elif clean_text[index] == "[":
+            highlight_open = True
+            start = index
+
+        #Close of highlight... Remove highlight from text.
+        elif clean_text[index] == "]" and highlight_open:
+            highlight = {'start': start, 'end': index, 'text': clean_text[start:index+1]}
+            clean_text = clean_text[0:start] + clean_text[index+1:]
+            index -= index+1 - start
+            highlights.append(highlight)
+            highlight_open = False
+        index += 1
+
+    #Find highlight offsets with new clean doc
+    offsets = []
+    for highlight in highlights:
+        text = highlight['text'][1:-1].strip()
+        text_length = len(text)
+
+        start_index = clean_text.find(text)
+        if start_index == -1:
+            raise Exception("Highlight '" + text + "' not recognized in " + filename + "\n")
+        end_index = start_index + text_length
+        offsets.append([start_index, end_index])
+
+    #Can the commented-out bracket check below be deleted?
+    #if '[' in clean_text or ']' in clean_text:
+    #    print "Unparsed brackets left in article:", article_number
+    #        raise ArticleParseError("Brackets remain in clean text!",
+    #                                ArticleParseError.BRACKET_WARNING)
+    #
 
     # print out our data.
     # TODO: store this somewhere.
@@ -115,6 +157,7 @@ def parse_article(raw_text, filename):
         'periodical': periodical,
         'periodical_code': periodical_code,
         'filename': filename,
+        'highlight_offsets': offsets,
     }
     return {
         'metadata': metadata,

diff --git a/data/pybossa_api.py b/data/pybossa_api.py
@@ -36,6 +36,9 @@ class ImproperConfigForRemote(Exception):
 class InvalidTaskRun(Exception):
     pass
 
+class DecidingForceParserError(Exception):  # Empty Exception subclass. NOTE(review): no raise/except site is visible in this diff -- confirm where it is used.
+    pass
+
 @django_rq.job('task_exporter', timeout=60, result_ttl=24*3600)
 def create_or_update_remote_project_worker(project_id,
                                            debug_presenter=False,