@@ -41,17 +41,25 @@ def __init__(self, chunk_size_limit: int = 3000) -> None:
4141 self .chunk_size_limit = chunk_size_limit
4242 self .keys = set ()
4343
44- def add (self , path : str , citation : str , key : Optional [str ] = None ) -> bool :
44+ def add (
45+ self ,
46+ path : str ,
47+ citation : str ,
48+ key : Optional [str ] = None ,
49+ disable_check : bool = False ,
50+ ) -> None :
4551 """Add a document to the collection."""
4652 if path in self .docs :
47- return False
53+ raise ValueError ( f"Document { path } already in collection." )
4854 if key is None :
4955 # get first name and year from citation
5056 try :
5157 author = re .search (r"([A-Z][a-z]+)" , citation ).group (1 )
5258 except AttributeError :
5359 # panicking - no word??
54- return False
60+ raise ValueError (
61+ f"Could not parse author from citation { citation } . Consider just passing key explicitly"
62+ )
5563 try :
5664 year = re .search (r"(\d{4})" , citation ).group (1 )
5765 except AttributeError :
@@ -70,18 +78,20 @@ def add(self, path: str, citation: str, key: Optional[str] = None) -> bool:
7078 data = {"citation" : citation , "key" : key }
7179 d = gpt_index .SimpleDirectoryReader (input_files = [path ]).load_data ()
7280 # loose check to see if document was loaded
73- if not maybe_is_text (d [0 ].text ):
74- return False
81+ if not disable_check and not maybe_is_text (d [0 ].text ):
82+ raise ValueError (
83+ f"This does not look like a text document: { path } . Path disable_check to ignore this error."
84+ )
7585 with HiddenPrints ():
7686 try :
7787 i = gpt_index .GPTSimpleVectorIndex (
7888 d , chunk_size_limit = self .chunk_size_limit
7989 )
8090 except UnicodeEncodeError :
81- return False
91+ # want to make this a valueerror so we can catch it
92+ raise ValueError (f"Failed to load document { path } ." )
8293 data ["index" ] = i
8394 self .docs [path ] = data
84- return True
8595
8696 # to pickle, we have to save the index as a file
8797 def __getstate__ (self ):
0 commit comments