diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py index e0c6139c0..8b1c5c104 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py @@ -27,6 +27,7 @@ MarkdownTextSplitter, PythonCodeTextSplitter, RecursiveCharacterTextSplitter, + RecursiveJsonSplitter, ) from langchain_community.vectorstores import FAISS @@ -83,7 +84,7 @@ def __init__(self, *args, **kwargs): action="store", default=DEFAULT_CHUNK_SIZE, type=int, - help="Max number of characters in chunk", + help="Maximum number of characters per chunk. This argument is not supported on JSON files.", ) self.parser.add_argument( "-o", @@ -91,7 +92,7 @@ def __init__(self, *args, **kwargs): action="store", default=DEFAULT_CHUNK_OVERLAP, type=int, - help="Number of characters overlapping between chunks, helpful to ensure text is not split mid-word or mid-sentence", + help="Number of characters that may overlap between chunks, which can help ensure a document is not split mid-word or mid-sentence. This argument is not supported on JSON files.", ) self.parser.add_argument("path", nargs=argparse.REMAINDER) self.index_name = "default" @@ -240,6 +241,7 @@ async def learn_dir( ".md": MarkdownTextSplitter(**splitter_kwargs), ".tex": LatexTextSplitter(**splitter_kwargs), ".ipynb": NotebookSplitter(**splitter_kwargs), + ".json": RecursiveJsonSplitter(), } splitter = ExtensionSplitter( splitters=splitters,