diff --git a/.venv2/Include/site/python3.12/greenlet/greenlet.h b/.venv2/Include/site/python3.12/greenlet/greenlet.h
new file mode 100644
index 0000000..d02a16e
--- /dev/null
+++ b/.venv2/Include/site/python3.12/greenlet/greenlet.h
@@ -0,0 +1,164 @@
+/* -*- indent-tabs-mode: nil; tab-width: 4; -*- */
+
+/* Greenlet object interface */
+
+#ifndef Py_GREENLETOBJECT_H
+#define Py_GREENLETOBJECT_H
+
+
+#include <Python.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* This is deprecated and undocumented. It does not change. */
+#define GREENLET_VERSION "1.0.0"
+
+#ifndef GREENLET_MODULE
+#define implementation_ptr_t void*
+#endif
+
+typedef struct _greenlet {
+    PyObject_HEAD
+    PyObject* weakreflist;      /* presumably the weak-reference list head -- TODO confirm */
+    PyObject* dict;             /* presumably the instance __dict__ -- TODO confirm */
+    implementation_ptr_t pimpl; /* plain void* unless GREENLET_MODULE is defined */
+} PyGreenlet;
+/* NULL-safe type check. `op` appears twice in the expansion: pass a side-effect-free expression. */
+#define PyGreenlet_Check(op) ((op) && PyObject_TypeCheck((op), &PyGreenlet_Type))
+
+
+/* C API functions */
+
+/* Number of slots in the exported API table; the *_NUM indices below run 0..11. */
+#define PyGreenlet_API_pointers 12
+
+#define PyGreenlet_Type_NUM 0
+#define PyExc_GreenletError_NUM 1
+#define PyExc_GreenletExit_NUM 2
+
+#define PyGreenlet_New_NUM 3
+#define PyGreenlet_GetCurrent_NUM 4
+#define PyGreenlet_Throw_NUM 5
+#define PyGreenlet_Switch_NUM 6
+#define PyGreenlet_SetParent_NUM 7
+
+#define PyGreenlet_MAIN_NUM 8
+#define PyGreenlet_STARTED_NUM 9
+#define PyGreenlet_ACTIVE_NUM 10
+#define PyGreenlet_GET_PARENT_NUM 11
+
+#ifndef GREENLET_MODULE
+/* For modules that use the greenlet C API; the table is NULL until PyGreenlet_Import(). */
+static void** _PyGreenlet_API = NULL;
+
+#    define PyGreenlet_Type \
+        (*(PyTypeObject*)_PyGreenlet_API[PyGreenlet_Type_NUM])
+
+#    define PyExc_GreenletError \
+        ((PyObject*)_PyGreenlet_API[PyExc_GreenletError_NUM])
+
+#    define PyExc_GreenletExit \
+        ((PyObject*)_PyGreenlet_API[PyExc_GreenletExit_NUM])
+
+/*
+ * PyGreenlet *PyGreenlet_New(PyObject *run, PyGreenlet *parent)
+ *
+ * greenlet.greenlet(run, parent=None)
+ */
+#    define PyGreenlet_New                                        \
+        (*(PyGreenlet * (*)(PyObject * run, PyGreenlet * parent)) \
+             _PyGreenlet_API[PyGreenlet_New_NUM])
+
+/*
+ * PyGreenlet *PyGreenlet_GetCurrent(void)
+ *
+ * greenlet.getcurrent()
+ */
+#    define PyGreenlet_GetCurrent \
+        (*(PyGreenlet * (*)(void)) _PyGreenlet_API[PyGreenlet_GetCurrent_NUM])
+
+/*
+ * PyObject *PyGreenlet_Throw(
+ *         PyGreenlet *greenlet,
+ *         PyObject *typ,
+ *         PyObject *val,
+ *         PyObject *tb)
+ *
+ * g.throw(...)
+ */
+#    define PyGreenlet_Throw                 \
+        (*(PyObject * (*)(PyGreenlet * self, \
+                          PyObject * typ,    \
+                          PyObject * val,    \
+                          PyObject * tb))    \
+             _PyGreenlet_API[PyGreenlet_Throw_NUM])
+
+/*
+ * PyObject *PyGreenlet_Switch(PyGreenlet *greenlet, PyObject *args, PyObject *kwargs)
+ *
+ * g.switch(*args, **kwargs)
+ */
+#    define PyGreenlet_Switch                                              \
+        (*(PyObject *                                                      \
+           (*)(PyGreenlet * greenlet, PyObject * args, PyObject * kwargs)) \
+             _PyGreenlet_API[PyGreenlet_Switch_NUM])
+
+/*
+ * int PyGreenlet_SetParent(PyGreenlet *greenlet, PyGreenlet *nparent)
+ *
+ * g.parent = new_parent
+ */
+#    define PyGreenlet_SetParent                                 \
+        (*(int (*)(PyGreenlet * greenlet, PyGreenlet * nparent)) \
+             _PyGreenlet_API[PyGreenlet_SetParent_NUM])
+
+/*
+ * PyGreenlet* PyGreenlet_GetParent(PyGreenlet* greenlet)
+ *
+ * return greenlet.parent;
+ *
+ * This could return NULL even if there is no exception active.
+ * If it does not return NULL, you are responsible for decrementing the
+ * reference count.
+ */
+#     define PyGreenlet_GetParent                                    \
+    (*(PyGreenlet* (*)(PyGreenlet*))                                 \
+     _PyGreenlet_API[PyGreenlet_GET_PARENT_NUM])
+
+/*
+ * Deprecated, undocumented alias for PyGreenlet_GetParent.
+ */
+#     define PyGreenlet_GET_PARENT PyGreenlet_GetParent
+/* The three accessors below each cast their table slot to int (*)(PyGreenlet*). */
+#     define PyGreenlet_MAIN                                         \
+    (*(int (*)(PyGreenlet*))                                         \
+     _PyGreenlet_API[PyGreenlet_MAIN_NUM])
+
+#     define PyGreenlet_STARTED                                      \
+    (*(int (*)(PyGreenlet*))                                         \
+     _PyGreenlet_API[PyGreenlet_STARTED_NUM])
+
+#     define PyGreenlet_ACTIVE                                       \
+    (*(int (*)(PyGreenlet*))                                         \
+     _PyGreenlet_API[PyGreenlet_ACTIVE_NUM])
+
+
+
+
+/* Imports greenlet and fills _PyGreenlet_API; check it for NULL afterwards. */
+/* NOTE: This has actually moved to ``greenlet._greenlet._C_API``, but we
+   keep the older definition to be sure older code that might have a copy of
+   the header still works. */
+#    define PyGreenlet_Import()                                               \
+        {                                                                     \
+            _PyGreenlet_API = (void**)PyCapsule_Import("greenlet._C_API", 0); \
+        }
+
+#endif /* GREENLET_MODULE */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_GREENLETOBJECT_H */
diff --git a/.venv2/Scripts/Activate.ps1 b/.venv2/Scripts/Activate.ps1
new file mode 100644
index 0000000..b49d77b
--- /dev/null
+++ b/.venv2/Scripts/Activate.ps1
@@ -0,0 +1,247 @@
+<#
+.Synopsis
+Activate a Python virtual environment for the current PowerShell session.
+
+.Description
+Pushes the python executable for a virtual environment to the front of the
+$Env:PATH environment variable and sets the prompt to signify that you are
+in a Python virtual environment. Makes use of the command line switches as
+well as the `pyvenv.cfg` file values present in the virtual environment.
+
+.Parameter VenvDir
+Path to the directory that contains the virtual environment to activate. The
+default value for this is the parent of the directory that the Activate.ps1
+script is located within.
+
+.Parameter Prompt
+The prompt prefix to display when this virtual environment is activated. By
+default, this prompt is the name of the virtual environment folder (VenvDir)
+surrounded by parentheses and followed by a single space (i.e. '(.venv) ').
+
+.Example
+Activate.ps1
+Activates the Python virtual environment that contains the Activate.ps1 script.
+
+.Example
+Activate.ps1 -Verbose
+Activates the Python virtual environment that contains the Activate.ps1 script,
+and shows extra information about the activation as it executes.
+
+.Example
+Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
+Activates the Python virtual environment located in the specified location.
+
+.Example
+Activate.ps1 -Prompt "MyPython"
+Activates the Python virtual environment that contains the Activate.ps1 script,
+and prefixes the current prompt with the specified string (surrounded in
+parentheses) while the virtual environment is active.
+
+.Notes
+On Windows, it may be required to enable this Activate.ps1 script by setting the
+execution policy for the user. You can do this by issuing the following PowerShell
+command:
+
+PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
+
+For more information on Execution Policies: 
+https://go.microsoft.com/fwlink/?LinkID=135170
+
+#>
+Param(
+    [Parameter(Mandatory = $false)]
+    [String]
+    $VenvDir, # Optional; when omitted, the parent of this script's directory is used.
+    [Parameter(Mandatory = $false)]
+    [String]
+    $Prompt # Optional; when omitted, pyvenv.cfg's prompt or the venv folder name is used.
+)
+
+<# Function declarations --------------------------------------------------- #>
+
+<#
+.Synopsis
+Remove all shell session elements added by the Activate script: restore the
+prior prompt, PYTHONHOME and PATH (dropping the virtual environment's entry
+from the front of PATH) and remove the VIRTUAL_ENV* variables.
+
+.Parameter NonDestructive
+If present, do not remove this function from the global namespace for the
+session.
+
+#>
+function global:deactivate ([switch]$NonDestructive) {
+    # Revert to original values
+
+    # The prior prompt:
+    if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
+        Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
+        Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
+    }
+
+    # The prior PYTHONHOME:
+    if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
+        Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
+        Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
+    }
+
+    # The prior PATH:
+    if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
+        Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
+        Remove-Item -Path Env:_OLD_VIRTUAL_PATH
+    }
+
+    # Just remove the VIRTUAL_ENV altogether:
+    if (Test-Path -Path Env:VIRTUAL_ENV) {
+        Remove-Item -Path env:VIRTUAL_ENV
+    }
+
+    # Just remove VIRTUAL_ENV_PROMPT altogether.
+    if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
+        Remove-Item -Path env:VIRTUAL_ENV_PROMPT
+    }
+
+    # Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
+    if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
+        Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
+    }
+
+    # Remove the deactivate function itself unless -NonDestructive was given:
+    if (-not $NonDestructive) {
+        Remove-Item -Path function:deactivate
+    }
+}
+
+<#
+.Description
+Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
+given folder, and returns them in a map.
+
+For each line in the pyvenv.cfg file, if that line can be parsed into exactly
+two strings separated by `=` (with any amount of whitespace surrounding the =)
+then it is considered a `key = value` line. The left hand string is the key,
+the right hand is the value.
+
+If the value starts with a `'` or a `"` and is at least two characters long,
+then its first and last characters are stripped before it is captured.
+
+.Parameter ConfigDir
+Path to the directory that contains the `pyvenv.cfg` file.
+#>
+function Get-PyVenvConfig(
+    [String]
+    $ConfigDir
+) {
+    Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
+
+    # Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
+    $pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
+
+    # An empty map will be returned if no config file is found.
+    $pyvenvConfig = @{ }
+
+    if ($pyvenvConfigPath) {
+
+        Write-Verbose "File exists, parse `key = value` lines"
+        $pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
+
+        $pyvenvConfigContent | ForEach-Object {
+            $keyval = $PSItem -split "\s*=\s*", 2
+            if ($keyval[0] -and $keyval[1]) {
+                $val = $keyval[1]
+
+                # Strip surrounding quotes; a lone quote character is kept as-is (nothing to strip).
+                if ($val.Length -ge 2 -and "'""".Contains($val.Substring(0, 1))) {
+                    $val = $val.Substring(1, $val.Length - 2)
+                }
+
+                $pyvenvConfig[$keyval[0]] = $val
+                Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
+            }
+        }
+    }
+    return $pyvenvConfig
+}
+
+
+<# Begin Activate script --------------------------------------------------- #>
+
+# Determine the containing directory of this script
+$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
+$VenvExecDir = Get-Item -Path $VenvExecPath
+
+Write-Verbose "Activation script is located in path: '$VenvExecPath'"
+Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)'"
+Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)'"
+
+# Set values required in priority: CmdLine, ConfigFile, Default
+# First, get the location of the virtual environment; it is taken from the
+# -VenvDir parameter when given, otherwise from the parent of VenvExecDir.
+if ($VenvDir) {
+    Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
+}
+else {
+    Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
+    $VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
+    Write-Verbose "VenvDir=$VenvDir"
+}
+
+# Next, read the `pyvenv.cfg` file to determine any required value such
+# as `prompt`.
+$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
+
+# Next, pick the prompt: the -Prompt parameter wins, then the `prompt` key of
+# pyvenv.cfg, and finally the leaf name of the virtual environment folder.
+if ($Prompt) {
+    Write-Verbose "Prompt specified as argument, using '$Prompt'"
+}
+else {
+    Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
+    if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
+        Write-Verbose "  Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
+        $Prompt = $pyvenvCfg['prompt'];
+    }
+    else {
+        Write-Verbose "  Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
+        Write-Verbose "  Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
+        $Prompt = Split-Path -Path $venvDir -Leaf
+    }
+}
+
+Write-Verbose "Prompt = '$Prompt'"
+Write-Verbose "VenvDir='$VenvDir'"
+
+# Deactivate any currently active virtual environment, but leave the
+# deactivate function in place.
+deactivate -nondestructive
+
+# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
+# that there is an activated venv.
+$env:VIRTUAL_ENV = $VenvDir
+
+if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
+
+    Write-Verbose "Setting prompt to '$Prompt'"
+
+    # Save the current prompt function as _OLD_VIRTUAL_PROMPT (declared global
+    # first, then overwritten by the copy) so that deactivate can restore it.
+    function global:_OLD_VIRTUAL_PROMPT { "" }
+    Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
+    New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
+
+    function global:prompt {
+        Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
+        _OLD_VIRTUAL_PROMPT
+    }
+    $env:VIRTUAL_ENV_PROMPT = $Prompt
+}
+
+# Clear PYTHONHOME, remembering its value so that deactivate can restore it
+if (Test-Path -Path Env:PYTHONHOME) {
+    Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
+    Remove-Item -Path Env:PYTHONHOME
+}
+
+# Save the old PATH for deactivate, then put the script directory in front of it
+Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
+$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"
diff --git a/.venv2/Scripts/activate b/.venv2/Scripts/activate
new file mode 100644
index 0000000..db11ccd
--- /dev/null
+++ b/.venv2/Scripts/activate
@@ -0,0 +1,70 @@
+# This file must be used with "source Scripts/activate" *from bash*
+# You cannot run it directly
+
+deactivate () {
+    # Restore PATH and PYTHONHOME to the values saved at activation, if any
+    if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
+        PATH="${_OLD_VIRTUAL_PATH:-}"
+        export PATH
+        unset _OLD_VIRTUAL_PATH
+    fi
+    if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
+        PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
+        export PYTHONHOME
+        unset _OLD_VIRTUAL_PYTHONHOME
+    fi
+
+    # Call hash to forget past commands. Without forgetting
+    # past commands the $PATH changes we made may not be respected
+    hash -r 2> /dev/null
+
+    if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
+        PS1="${_OLD_VIRTUAL_PS1:-}"
+        export PS1
+        unset _OLD_VIRTUAL_PS1
+    fi
+
+    unset VIRTUAL_ENV
+    unset VIRTUAL_ENV_PROMPT
+    if [ ! "${1:-}" = "nondestructive" ] ; then
+        # Self destruct! (skipped when called with the "nondestructive" argument)
+        unset -f deactivate
+    fi
+}
+
+# Undo any previous activation, but keep the deactivate function defined
+deactivate nondestructive
+
+# on Windows, a path can contain colons and backslashes and has to be converted:
+if [ "${OSTYPE:-}" = "cygwin" ] || [ "${OSTYPE:-}" = "msys" ] ; then
+    # transform D:\path\to\venv to /d/path/to/venv on MSYS
+    # and to /cygdrive/d/path/to/venv on Cygwin
+    export VIRTUAL_ENV=$(cygpath "C:\Users\Kevin\Documents\Work\SCP\AI\RAG\appl-kgraph\.venv2")
+else
+    # use the path as-is
+    export VIRTUAL_ENV="C:\Users\Kevin\Documents\Work\SCP\AI\RAG\appl-kgraph\.venv2"
+fi
+
+_OLD_VIRTUAL_PATH="$PATH"
+PATH="$VIRTUAL_ENV/Scripts:$PATH"
+export PATH
+
+# Unset PYTHONHOME if set, saving its value so that deactivate can restore it.
+# This check treats PYTHONHOME set to the empty string (which is bad anyway) as unset;
+# could use `if (set -u; : $PYTHONHOME) ;` in bash
+if [ -n "${PYTHONHOME:-}" ] ; then
+    _OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
+    unset PYTHONHOME
+fi
+
+if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
+    _OLD_VIRTUAL_PS1="${PS1:-}"
+    PS1="(.venv2) ${PS1:-}"
+    export PS1
+    VIRTUAL_ENV_PROMPT="(.venv2) "
+    export VIRTUAL_ENV_PROMPT
+fi
+
+# Call hash to forget past commands. Without forgetting
+# past commands the $PATH changes we made may not be respected
+hash -r 2> /dev/null
diff --git a/.venv2/Scripts/activate.bat b/.venv2/Scripts/activate.bat
new file mode 100644
index 0000000..9dd51fc
--- /dev/null
+++ b/.venv2/Scripts/activate.bat
@@ -0,0 +1,34 @@
+@echo off
+
+rem This file is UTF-8 encoded, so we need to update the current code page while executing it
+for /f "tokens=2 delims=:." %%a in ('"%SystemRoot%\System32\chcp.com"') do (
+    set _OLD_CODEPAGE=%%a
+)
+if defined _OLD_CODEPAGE (
+    "%SystemRoot%\System32\chcp.com" 65001 > nul
+)
+rem Assignments below use the set "NAME=value" form so special characters in values stay literal.
+set "VIRTUAL_ENV=C:\Users\Kevin\Documents\Work\SCP\AI\RAG\appl-kgraph\.venv2"
+
+if not defined PROMPT set PROMPT=$P$G
+rem Undo a previous activation first so that activating twice does not stack prompts.
+if defined _OLD_VIRTUAL_PROMPT set "PROMPT=%_OLD_VIRTUAL_PROMPT%"
+if defined _OLD_VIRTUAL_PYTHONHOME set "PYTHONHOME=%_OLD_VIRTUAL_PYTHONHOME%"
+
+set "_OLD_VIRTUAL_PROMPT=%PROMPT%"
+set "PROMPT=(.venv2) %PROMPT%"
+
+if defined PYTHONHOME set "_OLD_VIRTUAL_PYTHONHOME=%PYTHONHOME%"
+set PYTHONHOME=
+rem Start from the PATH saved by an earlier activation if there is one, else save the current PATH.
+if defined _OLD_VIRTUAL_PATH set "PATH=%_OLD_VIRTUAL_PATH%"
+if not defined _OLD_VIRTUAL_PATH set "_OLD_VIRTUAL_PATH=%PATH%"
+
+set "PATH=%VIRTUAL_ENV%\Scripts;%PATH%"
+set "VIRTUAL_ENV_PROMPT=(.venv2) "
+
+:END
+if defined _OLD_CODEPAGE (
+    "%SystemRoot%\System32\chcp.com" %_OLD_CODEPAGE% > nul
+    set _OLD_CODEPAGE=
+)
diff --git a/.venv2/Scripts/chroma.exe b/.venv2/Scripts/chroma.exe
new file mode 100644
index 0000000..43ccf99
Binary files /dev/null and b/.venv2/Scripts/chroma.exe differ
diff --git a/.venv2/Scripts/coloredlogs.exe b/.venv2/Scripts/coloredlogs.exe
new file mode 100644
index 0000000..ef79943
Binary files /dev/null and b/.venv2/Scripts/coloredlogs.exe differ
diff --git a/.venv2/Scripts/deactivate.bat b/.venv2/Scripts/deactivate.bat
new file mode 100644
index 0000000..44dae49
--- /dev/null
+++ b/.venv2/Scripts/deactivate.bat
@@ -0,0 +1,22 @@
+@echo off
+rem Restore PROMPT, PYTHONHOME and PATH from their _OLD_VIRTUAL_* copies, then clear those copies.
+if defined _OLD_VIRTUAL_PROMPT (
+    set "PROMPT=%_OLD_VIRTUAL_PROMPT%"
+)
+set _OLD_VIRTUAL_PROMPT=
+
+if defined _OLD_VIRTUAL_PYTHONHOME (
+    set "PYTHONHOME=%_OLD_VIRTUAL_PYTHONHOME%"
+    set _OLD_VIRTUAL_PYTHONHOME=
+)
+
+if defined _OLD_VIRTUAL_PATH (
+    set "PATH=%_OLD_VIRTUAL_PATH%"
+)
+
+set _OLD_VIRTUAL_PATH=
+
+set VIRTUAL_ENV=
+set VIRTUAL_ENV_PROMPT=
+
+:END
diff --git a/.venv2/Scripts/distro.exe b/.venv2/Scripts/distro.exe
new file mode 100644
index 0000000..00cde8d
Binary files /dev/null and b/.venv2/Scripts/distro.exe differ
diff --git a/.venv2/Scripts/docx2pdf.exe b/.venv2/Scripts/docx2pdf.exe
new file mode 100644
index 0000000..ffa497a
Binary files /dev/null and b/.venv2/Scripts/docx2pdf.exe differ
diff --git a/.venv2/Scripts/dotenv.exe b/.venv2/Scripts/dotenv.exe
new file mode 100644
index 0000000..fbd2c43
Binary files /dev/null and b/.venv2/Scripts/dotenv.exe differ
diff --git a/.venv2/Scripts/f2py.exe b/.venv2/Scripts/f2py.exe
new file mode 100644
index 0000000..0003352
Binary files /dev/null and b/.venv2/Scripts/f2py.exe differ
diff --git a/.venv2/Scripts/fastapi.exe b/.venv2/Scripts/fastapi.exe
new file mode 100644
index 0000000..a295d41
Binary files /dev/null and b/.venv2/Scripts/fastapi.exe differ
diff --git a/.venv2/Scripts/fonttools.exe b/.venv2/Scripts/fonttools.exe
new file mode 100644
index 0000000..6ed0f78
Binary files /dev/null and b/.venv2/Scripts/fonttools.exe differ
diff --git a/.venv2/Scripts/gradio.exe b/.venv2/Scripts/gradio.exe
new file mode 100644
index 0000000..c81eef3
Binary files /dev/null and b/.venv2/Scripts/gradio.exe differ
diff --git a/.venv2/Scripts/hf.exe b/.venv2/Scripts/hf.exe
new file mode 100644
index 0000000..6a410a8
Binary files /dev/null and b/.venv2/Scripts/hf.exe differ
diff --git a/.venv2/Scripts/httpx.exe b/.venv2/Scripts/httpx.exe
new file mode 100644
index 0000000..3556585
Binary files /dev/null and b/.venv2/Scripts/httpx.exe differ
diff --git a/.venv2/Scripts/huggingface-cli.exe b/.venv2/Scripts/huggingface-cli.exe
new file mode 100644
index 0000000..03f13c0
Binary files /dev/null and b/.venv2/Scripts/huggingface-cli.exe differ
diff --git a/.venv2/Scripts/humanfriendly.exe b/.venv2/Scripts/humanfriendly.exe
new file mode 100644
index 0000000..746633e
Binary files /dev/null and b/.venv2/Scripts/humanfriendly.exe differ
diff --git a/.venv2/Scripts/ipython.exe b/.venv2/Scripts/ipython.exe
new file mode 100644
index 0000000..c204ca2
Binary files /dev/null and b/.venv2/Scripts/ipython.exe differ
diff --git a/.venv2/Scripts/ipython3.exe b/.venv2/Scripts/ipython3.exe
new file mode 100644
index 0000000..c204ca2
Binary files /dev/null and b/.venv2/Scripts/ipython3.exe differ
diff --git a/.venv2/Scripts/isympy.exe b/.venv2/Scripts/isympy.exe
new file mode 100644
index 0000000..ca9065e
Binary files /dev/null and b/.venv2/Scripts/isympy.exe differ
diff --git a/.venv2/Scripts/jsondiff b/.venv2/Scripts/jsondiff
new file mode 100644
index 0000000..cb80f7f
--- /dev/null
+++ b/.venv2/Scripts/jsondiff
@@ -0,0 +1,41 @@
+#!C:\Users\Kevin\Documents\Work\SCP\AI\RAG\appl-kgraph\.venv2\Scripts\python.exe
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+import sys
+import json
+import jsonpatch
+import argparse
+
+
+parser = argparse.ArgumentParser(description='Diff two JSON files')
+parser.add_argument('FILE1', type=argparse.FileType('r'))  # opened for reading by argparse
+parser.add_argument('FILE2', type=argparse.FileType('r'))  # opened for reading by argparse
+parser.add_argument('--indent', type=int, default=None,
+                    help='Indent output by n spaces')
+parser.add_argument('-u', '--preserve-unicode', action='store_true',
+                    help='Output Unicode character as-is without using Code Point')
+parser.add_argument('-v', '--version', action='version',
+                    version='%(prog)s ' + jsonpatch.__version__)
+
+
+def main():
+    try:
+        diff_files()
+    except KeyboardInterrupt:
+        sys.exit(1)
+
+
+def diff_files():
+    """ Diffs two JSON files, prints the patch and exits with status 1 if it is non-empty """
+    args = parser.parse_args()
+    doc1 = json.load(args.FILE1)
+    doc2 = json.load(args.FILE2)
+    patch = jsonpatch.make_patch(doc1, doc2)
+    if patch.patch:
+        print(json.dumps(patch.patch, indent=args.indent, ensure_ascii=not(args.preserve_unicode)))
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
diff --git a/.venv2/Scripts/jsonpatch b/.venv2/Scripts/jsonpatch
new file mode 100644
index 0000000..e528717
--- /dev/null
+++ b/.venv2/Scripts/jsonpatch
@@ -0,0 +1,107 @@
+#!C:\Users\Kevin\Documents\Work\SCP\AI\RAG\appl-kgraph\.venv2\Scripts\python.exe
+# -*- coding: utf-8 -*-
+
+import sys
+import os.path
+import json
+import jsonpatch
+import tempfile
+import argparse
+
+
+parser = argparse.ArgumentParser(
+    description='Apply a JSON patch on a JSON file')
+parser.add_argument('ORIGINAL', type=argparse.FileType('r'),
+                    help='Original file')
+parser.add_argument('PATCH', type=argparse.FileType('r'),
+                    nargs='?', default=sys.stdin,
+                    help='Patch file (read from stdin if omitted)')
+parser.add_argument('--indent', type=int, default=None,
+                    help='Indent output by n spaces')
+parser.add_argument('-b', '--backup', action='store_true',  # only consulted together with --in-place
+                    help='Back up ORIGINAL if modifying in-place')
+parser.add_argument('-i', '--in-place', action='store_true',
+                    help='Modify ORIGINAL in-place instead of to stdout')
+parser.add_argument('-v', '--version', action='version',
+                    version='%(prog)s ' + jsonpatch.__version__)
+parser.add_argument('-u', '--preserve-unicode', action='store_true',
+                    help='Output Unicode character as-is without using Code Point')
+
+def main():
+    try:
+        patch_files()
+    except KeyboardInterrupt:
+        sys.exit(1)
+
+
+def patch_files():
+    """ Diffs two JSON files and prints a patch """
+    args = parser.parse_args()
+    doc = json.load(args.ORIGINAL)
+    patch = json.load(args.PATCH)
+    result = jsonpatch.apply_patch(doc, patch)
+
+    if args.in_place:
+        dirname = os.path.abspath(os.path.dirname(args.ORIGINAL.name))
+
+        try:
+            # Attempt to replace the file atomically.  We do this by
+            # creating a temporary file in the same directory as the
+            # original file so we can atomically move the new file over
+            # the original later.  (This is done in the same directory
+	    # because atomic renames do not work across mount points.)
+
+            fd, pathname = tempfile.mkstemp(dir=dirname)
+            fp = os.fdopen(fd, 'w')
+            atomic = True
+
+        except OSError:
+            # We failed to create the temporary file for an atomic
+            # replace, so fall back to non-atomic mode by backing up
+            # the original (if desired) and writing a new file.
+
+            if args.backup:
+                os.rename(args.ORIGINAL.name, args.ORIGINAL.name + '.orig')
+            fp = open(args.ORIGINAL.name, 'w')
+            atomic = False
+
+    else:
+        # Since we're not replacing the original file in-place, write
+        # the modified JSON to stdout instead.
+
+        fp = sys.stdout
+
+    # By this point we have some sort of file object we can write the 
+    # modified JSON to.
+    
+    json.dump(result, fp, indent=args.indent, ensure_ascii=not(args.preserve_unicode))
+    fp.write('\n')
+
+    if args.in_place:
+        # Close the new file.  If we aren't replacing atomically, this
+        # is our last step, since everything else is already in place.
+
+        fp.close()
+
+        if atomic:
+            try:
+                # Complete the atomic replace by linking the original
+                # to a backup (if desired), fixing up the permissions
+                # on the temporary file, and moving it into place.
+
+                if args.backup:
+                    os.link(args.ORIGINAL.name, args.ORIGINAL.name + '.orig')
+                os.chmod(pathname, os.stat(args.ORIGINAL.name).st_mode)
+                os.rename(pathname, args.ORIGINAL.name)
+
+            except OSError:
+                # In the event we could not actually do the atomic
+                # replace, unlink the original to move it out of the
+                # way and finally move the temporary file into place.
+                
+                os.unlink(args.ORIGINAL.name)
+                os.rename(pathname, args.ORIGINAL.name)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.venv2/Scripts/jsonpointer b/.venv2/Scripts/jsonpointer
new file mode 100644
index 0000000..69feca9
--- /dev/null
+++ b/.venv2/Scripts/jsonpointer
@@ -0,0 +1,67 @@
+#!C:\Users\Kevin\Documents\Work\SCP\AI\RAG\appl-kgraph\.venv2\Scripts\python.exe
+# -*- coding: utf-8 -*-
+
+
+import argparse
+import json
+import sys
+
+import jsonpointer
+
+parser = argparse.ArgumentParser(
+    description='Resolve a JSON pointer on JSON files')
+
+# Accept the pointer either as the POINTER argument or from a file (-f); one of them is required
+ptr_group = parser.add_mutually_exclusive_group(required=True)
+
+ptr_group.add_argument('-f', '--pointer-file', type=argparse.FileType('r'),
+                       nargs='?',
+                       help='File containing a JSON pointer expression')
+
+ptr_group.add_argument('POINTER', type=str, nargs='?',
+                       help='A JSON pointer expression')
+
+parser.add_argument('FILE', type=argparse.FileType('r'), nargs='+',
+                    help='Files for which the pointer should be resolved')
+parser.add_argument('--indent', type=int, default=None,
+                    help='Indent output by n spaces')
+parser.add_argument('-v', '--version', action='version',
+                    version='%(prog)s ' + jsonpointer.__version__)
+
+
+def main():
+    try:
+        resolve_files()
+    except KeyboardInterrupt:
+        sys.exit(1)
+
+
+def parse_pointer(args):
+    if args.POINTER:
+        ptr = args.POINTER
+    elif args.pointer_file:
+        ptr = args.pointer_file.read().strip()
+    else:
+        parser.print_usage()
+        sys.exit(1)
+
+    return ptr
+
+
+def resolve_files():
+    """ Resolve the pointer against each FILE in turn, printing each result as JSON """
+    args = parser.parse_args()
+
+    ptr = parse_pointer(args)
+
+    for f in args.FILE:
+        doc = json.load(f)
+        try:
+            result = jsonpointer.resolve_pointer(doc, ptr)
+            print(json.dumps(result, indent=args.indent))
+        except jsonpointer.JsonPointerException as e:  # report on stderr, then go on to the next file
+            print('Could not resolve pointer: %s' % str(e), file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.venv2/Scripts/jsonschema.exe b/.venv2/Scripts/jsonschema.exe
new file mode 100644
index 0000000..f3c8336
Binary files /dev/null and b/.venv2/Scripts/jsonschema.exe differ
diff --git a/.venv2/Scripts/markdown-it.exe b/.venv2/Scripts/markdown-it.exe
new file mode 100644
index 0000000..e3f6db7
Binary files /dev/null and b/.venv2/Scripts/markdown-it.exe differ
diff --git a/.venv2/Scripts/normalizer.exe b/.venv2/Scripts/normalizer.exe
new file mode 100644
index 0000000..64d6ed3
Binary files /dev/null and b/.venv2/Scripts/normalizer.exe differ
diff --git a/.venv2/Scripts/numpy-config.exe b/.venv2/Scripts/numpy-config.exe
new file mode 100644
index 0000000..1201da7
Binary files /dev/null and b/.venv2/Scripts/numpy-config.exe differ
diff --git a/.venv2/Scripts/onnxruntime_test.exe b/.venv2/Scripts/onnxruntime_test.exe
new file mode 100644
index 0000000..10f5d34
Binary files /dev/null and b/.venv2/Scripts/onnxruntime_test.exe differ
diff --git a/.venv2/Scripts/openai.exe b/.venv2/Scripts/openai.exe
new file mode 100644
index 0000000..12e5395
Binary files /dev/null and b/.venv2/Scripts/openai.exe differ
diff --git a/.venv2/Scripts/pip.exe b/.venv2/Scripts/pip.exe
new file mode 100644
index 0000000..550cac4
Binary files /dev/null and b/.venv2/Scripts/pip.exe differ
diff --git a/.venv2/Scripts/pip3.12.exe b/.venv2/Scripts/pip3.12.exe
new file mode 100644
index 0000000..550cac4
Binary files /dev/null and b/.venv2/Scripts/pip3.12.exe differ
diff --git a/.venv2/Scripts/pip3.exe b/.venv2/Scripts/pip3.exe
new file mode 100644
index 0000000..550cac4
Binary files /dev/null and b/.venv2/Scripts/pip3.exe differ
diff --git a/.venv2/Scripts/pybase64.exe b/.venv2/Scripts/pybase64.exe
new file mode 100644
index 0000000..c593d9d
Binary files /dev/null and b/.venv2/Scripts/pybase64.exe differ
diff --git a/.venv2/Scripts/pyftmerge.exe b/.venv2/Scripts/pyftmerge.exe
new file mode 100644
index 0000000..b733be4
Binary files /dev/null and b/.venv2/Scripts/pyftmerge.exe differ
diff --git a/.venv2/Scripts/pyftsubset.exe b/.venv2/Scripts/pyftsubset.exe
new file mode 100644
index 0000000..1218634
Binary files /dev/null and b/.venv2/Scripts/pyftsubset.exe differ
diff --git a/.venv2/Scripts/pygmentize.exe b/.venv2/Scripts/pygmentize.exe
new file mode 100644
index 0000000..24010ce
Binary files /dev/null and b/.venv2/Scripts/pygmentize.exe differ
diff --git a/.venv2/Scripts/pymupdf.exe b/.venv2/Scripts/pymupdf.exe
new file mode 100644
index 0000000..66eaa4b
Binary files /dev/null and b/.venv2/Scripts/pymupdf.exe differ
diff --git a/.venv2/Scripts/pyproject-build.exe b/.venv2/Scripts/pyproject-build.exe
new file mode 100644
index 0000000..f6cdc9e
Binary files /dev/null and b/.venv2/Scripts/pyproject-build.exe differ
diff --git a/.venv2/Scripts/pyrsa-decrypt.exe b/.venv2/Scripts/pyrsa-decrypt.exe
new file mode 100644
index 0000000..fb14df9
Binary files /dev/null and b/.venv2/Scripts/pyrsa-decrypt.exe differ
diff --git a/.venv2/Scripts/pyrsa-encrypt.exe b/.venv2/Scripts/pyrsa-encrypt.exe
new file mode 100644
index 0000000..8575f13
Binary files /dev/null and b/.venv2/Scripts/pyrsa-encrypt.exe differ
diff --git a/.venv2/Scripts/pyrsa-keygen.exe b/.venv2/Scripts/pyrsa-keygen.exe
new file mode 100644
index 0000000..b79ffc5
Binary files /dev/null and b/.venv2/Scripts/pyrsa-keygen.exe differ
diff --git a/.venv2/Scripts/pyrsa-priv2pub.exe b/.venv2/Scripts/pyrsa-priv2pub.exe
new file mode 100644
index 0000000..d246a18
Binary files /dev/null and b/.venv2/Scripts/pyrsa-priv2pub.exe differ
diff --git a/.venv2/Scripts/pyrsa-sign.exe b/.venv2/Scripts/pyrsa-sign.exe
new file mode 100644
index 0000000..c266f67
Binary files /dev/null and b/.venv2/Scripts/pyrsa-sign.exe differ
diff --git a/.venv2/Scripts/pyrsa-verify.exe b/.venv2/Scripts/pyrsa-verify.exe
new file mode 100644
index 0000000..bd87775
Binary files /dev/null and b/.venv2/Scripts/pyrsa-verify.exe differ
diff --git a/.venv2/Scripts/python.exe b/.venv2/Scripts/python.exe
new file mode 100644
index 0000000..1942b9a
Binary files /dev/null and b/.venv2/Scripts/python.exe differ
diff --git a/.venv2/Scripts/pythonw.exe b/.venv2/Scripts/pythonw.exe
new file mode 100644
index 0000000..e770f9d
Binary files /dev/null and b/.venv2/Scripts/pythonw.exe differ
diff --git a/.venv2/Scripts/pywin32_postinstall.exe b/.venv2/Scripts/pywin32_postinstall.exe
new file mode 100644
index 0000000..5a34b0f
Binary files /dev/null and b/.venv2/Scripts/pywin32_postinstall.exe differ
diff --git a/.venv2/Scripts/pywin32_postinstall.py b/.venv2/Scripts/pywin32_postinstall.py
new file mode 100644
index 0000000..7b1a1fd
--- /dev/null
+++ b/.venv2/Scripts/pywin32_postinstall.py
@@ -0,0 +1,733 @@
+# postinstall script for pywin32
+#
+# copies pywintypesXX.dll and pythoncomXX.dll into the system directory,
+# and creates a pth file
+import argparse
+import glob
+import os
+import shutil
+import sys
+import sysconfig
+import tempfile
+import winreg
+
+tee_f = open(
+    os.path.join(
+        tempfile.gettempdir(),  # Send output somewhere so it can be found if necessary...
+        "pywin32_postinstall.log",
+    ),
+    "w",
+)
+
+
+class Tee:
+    def __init__(self, file):
+        self.f = file
+
+    def write(self, what):
+        if self.f is not None:
+            try:
+                self.f.write(what.replace("\n", "\r\n"))
+            except OSError:
+                pass
+        tee_f.write(what)
+
+    def flush(self):
+        if self.f is not None:
+            try:
+                self.f.flush()
+            except OSError:
+                pass
+        tee_f.flush()
+
+
+sys.stderr = Tee(sys.stderr)
+sys.stdout = Tee(sys.stdout)
+
+com_modules = [
+    # module_name,                      class_names
+    ("win32com.servers.interp", "Interpreter"),
+    ("win32com.servers.dictionary", "DictionaryPolicy"),
+    ("win32com.axscript.client.pyscript", "PyScript"),
+]
+
+# Is this a 'silent' install - ie, avoid all dialogs.
+# Different than 'verbose'
+silent = 0
+
+# Verbosity of output messages.
+verbose = 1
+
+root_key_name = "Software\\Python\\PythonCore\\" + sys.winver
+
+
+def get_root_hkey():
+    """Return HKLM if Python's key there allows creating subkeys, otherwise HKCU."""
+    try:
+        with winreg.OpenKey(  # probe only - the handle is closed on leaving the block
+            winreg.HKEY_LOCAL_MACHINE, root_key_name, 0, winreg.KEY_CREATE_SUB_KEY
+        ):
+            return winreg.HKEY_LOCAL_MACHINE
+    except OSError:
+        # Key missing, or no permission to create subkeys there: must be HKCU.
+        return winreg.HKEY_CURRENT_USER
+
+
+# Create a function with the same signature as create_shortcut
+# previously provided by bdist_wininst
+def create_shortcut(
+    path, description, filename, arguments="", workdir="", iconpath="", iconindex=0
+):
+    import pythoncom
+    from win32com.shell import shell
+
+    ilink = pythoncom.CoCreateInstance(
+        shell.CLSID_ShellLink,
+        None,
+        pythoncom.CLSCTX_INPROC_SERVER,
+        shell.IID_IShellLink,
+    )
+    ilink.SetPath(path)
+    ilink.SetDescription(description)
+    if arguments:
+        ilink.SetArguments(arguments)
+    if workdir:
+        ilink.SetWorkingDirectory(workdir)
+    if iconpath or iconindex:
+        ilink.SetIconLocation(iconpath, iconindex)
+    # now save it.
+    ipf = ilink.QueryInterface(pythoncom.IID_IPersistFile)
+    ipf.Save(filename, 0)
+
+
+# Support the same list of "path names" as bdist_wininst used to
+def get_special_folder_path(path_name):
+    from win32com.shell import shell, shellcon
+
+    for maybe in """
+        CSIDL_COMMON_STARTMENU CSIDL_STARTMENU CSIDL_COMMON_APPDATA
+        CSIDL_LOCAL_APPDATA CSIDL_APPDATA CSIDL_COMMON_DESKTOPDIRECTORY
+        CSIDL_DESKTOPDIRECTORY CSIDL_COMMON_STARTUP CSIDL_STARTUP
+        CSIDL_COMMON_PROGRAMS CSIDL_PROGRAMS CSIDL_PROGRAM_FILES_COMMON
+        CSIDL_PROGRAM_FILES CSIDL_FONTS""".split():
+        if maybe == path_name:
+            csidl = getattr(shellcon, maybe)
+            return shell.SHGetSpecialFolderPath(0, csidl, False)
+    raise ValueError(f"{path_name} is an unknown path ID")
+
+
+def CopyTo(desc, src, dest):
+    import win32api
+    import win32con
+
+    while 1:
+        try:
+            win32api.CopyFile(src, dest, 0)
+            return
+        except win32api.error as details:
+            if details.winerror == 5:  # access denied - user not admin.
+                raise
+            if silent:
+                # Running silent mode - just re-raise the error.
+                raise
+            full_desc = (
+                f"Error {desc}\n\n"
+                "If you have any Python applications running, "
+                f"please close them now\nand select 'Retry'\n\n{details.strerror}"
+            )
+            rc = win32api.MessageBox(
+                0, full_desc, "Installation Error", win32con.MB_ABORTRETRYIGNORE
+            )
+            if rc == win32con.IDABORT:
+                raise
+            elif rc == win32con.IDIGNORE:
+                return
+            # else retry - around we go again.
+
+
+# We need to import win32api to determine the Windows system directory,
+# so we can copy our system files there - but importing win32api will
+# load the pywintypes.dll already in the system directory preventing us
+# from updating them!
+# So, we pull the same trick pywintypes.py does, but it loads from
+# our pywintypes_system32 directory.
+def LoadSystemModule(lib_dir, modname):
+    # See if this is a debug build.
+    import importlib.machinery
+    import importlib.util
+
+    suffix = "_d" if "_d.pyd" in importlib.machinery.EXTENSION_SUFFIXES else ""
+    filename = "%s%d%d%s.dll" % (
+        modname,
+        sys.version_info.major,
+        sys.version_info.minor,
+        suffix,
+    )
+    filename = os.path.join(lib_dir, "pywin32_system32", filename)
+    loader = importlib.machinery.ExtensionFileLoader(modname, filename)
+    spec = importlib.machinery.ModuleSpec(name=modname, loader=loader, origin=filename)
+    mod = importlib.util.module_from_spec(spec)
+    loader.exec_module(mod)
+
+
+def SetPyKeyVal(key_name, value_name, value):
+    root_hkey = get_root_hkey()
+    root_key = winreg.OpenKey(root_hkey, root_key_name)
+    try:
+        my_key = winreg.CreateKey(root_key, key_name)
+        try:
+            winreg.SetValueEx(my_key, value_name, 0, winreg.REG_SZ, value)
+            if verbose:
+                print(f"-> {root_key_name}\\{key_name}[{value_name}]={value!r}")
+        finally:
+            my_key.Close()
+    finally:
+        root_key.Close()
+
+
+def UnsetPyKeyVal(key_name, value_name, delete_key=False):
+    root_hkey = get_root_hkey()
+    root_key = winreg.OpenKey(root_hkey, root_key_name)
+    try:
+        my_key = winreg.OpenKey(root_key, key_name, 0, winreg.KEY_SET_VALUE)
+        try:
+            winreg.DeleteValue(my_key, value_name)
+            if verbose:
+                print(f"-> DELETE {root_key_name}\\{key_name}[{value_name}]")
+        finally:
+            my_key.Close()
+        if delete_key:
+            winreg.DeleteKey(root_key, key_name)
+            if verbose:
+                print(f"-> DELETE {root_key_name}\\{key_name}")
+    except OSError as why:
+        winerror = getattr(why, "winerror", why.errno)
+        if winerror != 2:  # file not found
+            raise
+    finally:
+        root_key.Close()
+
+
+def RegisterCOMObjects(register=True):
+    import win32com.server.register
+
+    if register:
+        func = win32com.server.register.RegisterClasses
+    else:
+        func = win32com.server.register.UnregisterClasses
+    flags = {}
+    if not verbose:
+        flags["quiet"] = 1
+    for module, klass_name in com_modules:
+        __import__(module)
+        mod = sys.modules[module]
+        flags["finalize_register"] = getattr(mod, "DllRegisterServer", None)
+        flags["finalize_unregister"] = getattr(mod, "DllUnregisterServer", None)
+        klass = getattr(mod, klass_name)
+        func(klass, **flags)
+
+
+def RegisterHelpFile(register=True, lib_dir=None):
+    if lib_dir is None:
+        lib_dir = sysconfig.get_paths()["platlib"]
+    if register:
+        # Register the .chm help file.
+        chm_file = os.path.join(lib_dir, "PyWin32.chm")
+        if os.path.isfile(chm_file):
+            # This isn't recursive, so if 'Help' doesn't exist, we croak
+            SetPyKeyVal("Help", None, None)
+            SetPyKeyVal("Help\\Pythonwin Reference", None, chm_file)
+            return chm_file
+        else:
+            print("NOTE: PyWin32.chm can not be located, so has not been registered")
+    else:
+        UnsetPyKeyVal("Help\\Pythonwin Reference", None, delete_key=True)
+    return None
+
+
+def RegisterPythonwin(register=True, lib_dir=None):
+    """Add (or remove) Pythonwin to context menu for python scripts.
+    ??? Should probably also add Edit command for pys files also.
+    Also need to remove these keys on uninstall, but there's no function
+    to add registry entries to uninstall log ???
+
+    register=False deletes the keys; lib_dir defaults to the "platlib" path.
+    """
+    if lib_dir is None:
+        lib_dir = sysconfig.get_paths()["platlib"]
+    classes_root = get_root_hkey()
+    ## Installer executable doesn't seem to pass anything to postinstall script indicating if it's a debug build
+    pythonwin_exe = os.path.join(lib_dir, "Pythonwin", "Pythonwin.exe")
+    pythonwin_edit_command = pythonwin_exe + ' -edit "%1"'
+
+    keys_vals = [
+        (
+            "Software\\Microsoft\\Windows\\CurrentVersion\\App Paths\\Pythonwin.exe",
+            "",
+            pythonwin_exe,
+        ),
+        (
+            "Software\\Classes\\Python.File\\shell\\Edit with Pythonwin",
+            "command",
+            pythonwin_edit_command,
+        ),
+        (
+            "Software\\Classes\\Python.NoConFile\\shell\\Edit with Pythonwin",
+            "command",
+            pythonwin_edit_command,
+        ),
+    ]
+
+    try:
+        if register:
+            for key, sub_key, val in keys_vals:
+                ## Since winreg only uses the character Api functions, this can fail if Python
+                ##  is installed to a path containing non-ascii characters
+                with winreg.CreateKey(classes_root, key) as hkey:
+                    if sub_key:
+                        with winreg.CreateKey(hkey, sub_key) as sub_hkey:
+                            winreg.SetValueEx(sub_hkey, None, 0, winreg.REG_SZ, val)
+                    else:
+                        winreg.SetValueEx(hkey, None, 0, winreg.REG_SZ, val)
+        else:
+            for key, sub_key, val in keys_vals:
+                try:
+                    if sub_key:
+                        with winreg.OpenKey(classes_root, key) as hkey:
+                            winreg.DeleteKey(hkey, sub_key)
+                    winreg.DeleteKey(classes_root, key)
+                except OSError as why:
+                    winerror = getattr(why, "winerror", why.errno)
+                    if winerror != 2:  # file not found
+                        raise
+    finally:
+        # tell windows about the change
+        from win32com.shell import shell, shellcon
+
+        shell.SHChangeNotify(
+            shellcon.SHCNE_ASSOCCHANGED, shellcon.SHCNF_IDLIST, None, None
+        )
+
+
+def get_shortcuts_folder():
+    if get_root_hkey() == winreg.HKEY_LOCAL_MACHINE:
+        try:
+            fldr = get_special_folder_path("CSIDL_COMMON_PROGRAMS")
+        except OSError:
+            # No CSIDL_COMMON_PROGRAMS on this platform
+            fldr = get_special_folder_path("CSIDL_PROGRAMS")
+    else:
+        # non-admin install - always goes in this user's start menu.
+        fldr = get_special_folder_path("CSIDL_PROGRAMS")
+
+    try:
+        install_group = winreg.QueryValue(
+            get_root_hkey(), root_key_name + "\\InstallPath\\InstallGroup"
+        )
+    except OSError:
+        install_group = "Python %d.%d" % (
+            sys.version_info.major,
+            sys.version_info.minor,
+        )
+    return os.path.join(fldr, install_group)
+
+
+# Get the system directory, which may be the Wow64 directory if we are a 32bit
+# python on a 64bit OS.
+def get_system_dir():
+    import win32api  # we assume this exists.
+
+    try:
+        import pythoncom
+        import win32process
+        from win32com.shell import shell, shellcon
+
+        try:
+            if win32process.IsWow64Process():
+                return shell.SHGetSpecialFolderPath(0, shellcon.CSIDL_SYSTEMX86)
+            return shell.SHGetSpecialFolderPath(0, shellcon.CSIDL_SYSTEM)
+        except (pythoncom.com_error, win32process.error):
+            return win32api.GetSystemDirectory()
+    except ImportError:
+        return win32api.GetSystemDirectory()
+
+
+def fixup_dbi():
+    # We used to have a dbi.pyd with our .pyd files, but now have a .py file.
+    # If the user didn't uninstall, they will find the .pyd which will cause
+    # problems - so handle that.
+    import win32api
+    import win32con
+
+    pyd_name = os.path.join(os.path.dirname(win32api.__file__), "dbi.pyd")
+    pyd_d_name = os.path.join(os.path.dirname(win32api.__file__), "dbi_d.pyd")
+    py_name = os.path.join(os.path.dirname(win32con.__file__), "dbi.py")
+    for this_pyd in (pyd_name, pyd_d_name):
+        this_dest = this_pyd + ".old"
+        if os.path.isfile(this_pyd) and os.path.isfile(py_name):
+            try:
+                if os.path.isfile(this_dest):
+                    print(
+                        f"Old dbi '{this_dest}' already exists - deleting '{this_pyd}'"
+                    )
+                    os.remove(this_pyd)
+                else:
+                    os.rename(this_pyd, this_dest)
+                    print(f"renamed '{this_pyd}'->'{this_pyd}.old'")
+            except OSError as exc:
+                print(f"FAILED to rename '{this_pyd}': {exc}")
+
+
+def install(lib_dir):
+    import traceback
+
+    # The .pth file is now installed as a regular file.
+    # Create the .pth file in the site-packages dir, and use only relative paths
+    # We used to write a .pth directly to sys.prefix - clobber it.
+    if os.path.isfile(os.path.join(sys.prefix, "pywin32.pth")):
+        os.unlink(os.path.join(sys.prefix, "pywin32.pth"))
+    # The .pth may be new and therefore not loaded in this session.
+    # Setup the paths just in case.
+    for name in "win32 win32\\lib Pythonwin".split():
+        sys.path.append(os.path.join(lib_dir, name))
+    # It is possible people with old versions installed with still have
+    # pywintypes and pythoncom registered.  We no longer need this, and stale
+    # entries hurt us.
+    for name in "pythoncom pywintypes".split():
+        keyname = "Software\\Python\\PythonCore\\" + sys.winver + "\\Modules\\" + name
+        for root in winreg.HKEY_LOCAL_MACHINE, winreg.HKEY_CURRENT_USER:
+            try:
+                winreg.DeleteKey(root, keyname + "\\Debug")
+            except OSError:
+                pass
+            try:
+                winreg.DeleteKey(root, keyname)
+            except OSError:
+                pass
+    LoadSystemModule(lib_dir, "pywintypes")
+    LoadSystemModule(lib_dir, "pythoncom")
+    import win32api
+
+    # and now we can get the system directory:
+    files = glob.glob(os.path.join(lib_dir, "pywin32_system32\\*.*"))
+    if not files:
+        raise RuntimeError("No system files to copy!!")
+    # Try the system32 directory first - if that fails due to "access denied",
+    # it implies a non-admin user, and we use sys.prefix
+    for dest_dir in [get_system_dir(), sys.prefix]:
+        # and copy some files over there
+        worked = 0
+        try:
+            for fname in files:
+                base = os.path.basename(fname)
+                dst = os.path.join(dest_dir, base)
+                CopyTo("installing %s" % base, fname, dst)
+                if verbose:
+                    print(f"Copied {base} to {dst}")
+                worked = 1
+                # Nuke any other versions that may exist - having
+                # duplicates causes major headaches.
+                bad_dest_dirs = [
+                    os.path.join(sys.prefix, "Library\\bin"),
+                    os.path.join(sys.prefix, "Lib\\site-packages\\win32"),
+                ]
+                if dest_dir != sys.prefix:
+                    bad_dest_dirs.append(sys.prefix)
+                for bad_dest_dir in bad_dest_dirs:
+                    bad_fname = os.path.join(bad_dest_dir, base)
+                    if os.path.exists(bad_fname):
+                        # let exceptions go here - delete must succeed
+                        os.unlink(bad_fname)
+            if worked:
+                break
+        except win32api.error as details:
+            if details.winerror == 5:
+                # access denied - user not admin - try sys.prefix dir,
+                # but first check that a version doesn't already exist
+                # in that place - otherwise that one will still get used!
+                if os.path.exists(dst):
+                    msg = (
+                        "The file '%s' exists, but can not be replaced "
+                        "due to insufficient permissions.  You must "
+                        "reinstall this software as an Administrator" % dst
+                    )
+                    print(msg)
+                    raise RuntimeError(msg)
+                continue
+            raise
+    else:
+        raise RuntimeError(
+            "You don't have enough permissions to install the system files"
+        )
+
+    # Register our demo COM objects.
+    try:
+        try:
+            RegisterCOMObjects()
+        except win32api.error as details:
+            if details.winerror != 5:  # ERROR_ACCESS_DENIED
+                raise
+            print("You do not have the permissions to install COM objects.")
+            print("The sample COM objects were not registered.")
+    except Exception:
+        print("FAILED to register the Python COM objects")
+        traceback.print_exc()
+
+    # There may be no main Python key in HKCU if, eg, an admin installed
+    # python itself.
+    winreg.CreateKey(get_root_hkey(), root_key_name)
+
+    chm_file = None
+    try:
+        chm_file = RegisterHelpFile(True, lib_dir)
+    except Exception:
+        print("Failed to register help file")
+        traceback.print_exc()
+    else:
+        if verbose:
+            print("Registered help file")
+
+    # misc other fixups.
+    fixup_dbi()
+
+    # Register Pythonwin in context menu
+    try:
+        RegisterPythonwin(True, lib_dir)
+    except Exception:
+        print("Failed to register pythonwin as editor")
+        traceback.print_exc()
+    else:
+        if verbose:
+            print("Pythonwin has been registered in context menu")
+
+    # Create the win32com\gen_py directory.
+    make_dir = os.path.join(lib_dir, "win32com", "gen_py")
+    if not os.path.isdir(make_dir):
+        if verbose:
+            print(f"Creating directory {make_dir}")
+        os.mkdir(make_dir)
+
+    try:
+        # create shortcuts
+        # CSIDL_COMMON_PROGRAMS only available works on NT/2000/XP, and
+        # will fail there if the user has no admin rights.
+        fldr = get_shortcuts_folder()
+        # If the group doesn't exist, then we don't make shortcuts - its
+        # possible that this isn't a "normal" install.
+        if os.path.isdir(fldr):
+            dst = os.path.join(fldr, "PythonWin.lnk")
+            create_shortcut(
+                os.path.join(lib_dir, "Pythonwin\\Pythonwin.exe"),
+                "The Pythonwin IDE",
+                dst,
+                "",
+                sys.prefix,
+            )
+            if verbose:
+                print("Shortcut for Pythonwin created")
+            # And the docs.
+            if chm_file:
+                dst = os.path.join(fldr, "Python for Windows Documentation.lnk")
+                doc = "Documentation for the PyWin32 extensions"
+                create_shortcut(chm_file, doc, dst)
+                if verbose:
+                    print("Shortcut to documentation created")
+        else:
+            if verbose:
+                print(f"Can't install shortcuts - {fldr!r} is not a folder")
+    except Exception as details:
+        print(details)
+
+    # importing win32com.client ensures the gen_py dir created - not strictly
+    # necessary to do now, but this makes the installation "complete"
+    try:
+        import win32com.client  # noqa
+    except ImportError:
+        # Don't let this error sound fatal
+        pass
+    print("The pywin32 extensions were successfully installed.")
+
+
+def uninstall(lib_dir):
+    # First ensure our system modules are loaded from pywin32_system32, so
+    # we can remove the ones we copied...
+    LoadSystemModule(lib_dir, "pywintypes")
+    LoadSystemModule(lib_dir, "pythoncom")
+
+    try:
+        RegisterCOMObjects(False)
+    except Exception as why:
+        print(f"Failed to unregister COM objects: {why}")
+
+    try:
+        RegisterHelpFile(False, lib_dir)
+    except Exception as why:
+        print(f"Failed to unregister help file: {why}")
+    else:
+        if verbose:
+            print("Unregistered help file")
+
+    try:
+        RegisterPythonwin(False, lib_dir)
+    except Exception as why:
+        print(f"Failed to unregister Pythonwin: {why}")
+    else:
+        if verbose:
+            print("Unregistered Pythonwin")
+
+    try:
+        # remove gen_py directory.
+        gen_dir = os.path.join(lib_dir, "win32com", "gen_py")
+        if os.path.isdir(gen_dir):
+            shutil.rmtree(gen_dir)
+            if verbose:
+                print(f"Removed directory {gen_dir}")
+
+        # Remove pythonwin compiled "config" files.
+        pywin_dir = os.path.join(lib_dir, "Pythonwin", "pywin")
+        for fname in glob.glob(os.path.join(pywin_dir, "*.cfc")):
+            os.remove(fname)
+
+        # The dbi.pyd.old files we may have created.
+        try:
+            os.remove(os.path.join(lib_dir, "win32", "dbi.pyd.old"))
+        except OSError:
+            pass
+        try:
+            os.remove(os.path.join(lib_dir, "win32", "dbi_d.pyd.old"))
+        except OSError:
+            pass
+
+    except Exception as why:
+        print(f"Failed to remove misc files: {why}")
+
+    try:
+        fldr = get_shortcuts_folder()
+        for link in ("PythonWin.lnk", "Python for Windows Documentation.lnk"):
+            fqlink = os.path.join(fldr, link)
+            if os.path.isfile(fqlink):
+                os.remove(fqlink)
+                if verbose:
+                    print(f"Removed {link}")
+    except Exception as why:
+        print(f"Failed to remove shortcuts: {why}")
+    # Now remove the system32 files.
+    files = glob.glob(os.path.join(lib_dir, "pywin32_system32\\*.*"))
+    # Try the system directory first - if nothing could be removed from there,
+    # fall through to sys.prefix, and stop at the first directory that worked.
+    try:
+        for dest_dir in [get_system_dir(), sys.prefix]:
+            # and remove our copies of the files from there
+            worked = 0
+            for fname in files:
+                base = os.path.basename(fname)
+                dst = os.path.join(dest_dir, base)
+                if os.path.isfile(dst):
+                    try:
+                        os.remove(dst)
+                        worked = 1
+                        if verbose:
+                            print("Removed file %s" % (dst))
+                    except Exception:
+                        print(f"FAILED to remove {dst}")
+            if worked:
+                break
+    except Exception as why:
+        print(f"FAILED to remove system files: {why}")
+
+
+# NOTE: This used to be run from inside the bdist_wininst created binary un/installer.
+# From inside the binary installer this script HAD to NOT
+# call sys.exit() or raise SystemExit, otherwise the installer would also terminate!
+# Out of principle, we're still not using system exits.
+
+
+def verify_destination(location: str) -> str:
+    location = os.path.abspath(location)
+    if not os.path.isdir(location):
+        raise argparse.ArgumentTypeError(
+            f'Path "{location}" is not an existing directory!'
+        )
+    return location
+
+
+def main():
+    """Parse the command line, set the silent/verbose globals, then install or remove."""
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description="""A post-install script for the pywin32 extensions.
+
+    * Typical usage:
+
+    > python -m pywin32_postinstall -install
+
+    * or (shorter but you don't have control over which python environment is used)
+
+    > pywin32_postinstall -install
+
+    You need to execute this script, with a '-install' parameter,
+    to ensure the environment is setup correctly to install COM objects, services, etc.
+    """,
+    )
+    parser.add_argument(
+        "-install",
+        default=False,
+        action="store_true",
+        help="Configure the Python environment correctly for pywin32.",
+    )
+    parser.add_argument(
+        "-remove",
+        default=False,
+        action="store_true",
+        help="Try and remove everything that was installed or copied.",
+    )
+    parser.add_argument(
+        "-wait",
+        type=int,
+        help="Wait for the specified process to terminate before starting.",
+    )
+    parser.add_argument(
+        "-silent",
+        default=False,
+        action="store_true",
+        help='Don\'t display the "Abort/Retry/Ignore" dialog for files in use.',
+    )
+    parser.add_argument(
+        "-quiet",
+        default=False,
+        action="store_true",
+        help="Don't display progress messages.",
+    )
+    parser.add_argument(
+        "-destination",
+        default=sysconfig.get_paths()["platlib"],
+        type=verify_destination,
+        help="Location of the PyWin32 installation",
+    )
+
+    args = parser.parse_args()
+
+    if not args.quiet:
+        print(f"Parsed arguments are: {args}")
+
+    if not args.install ^ args.remove:
+        parser.error("You need to either choose to -install or -remove!")
+
+    if args.wait is not None:
+        try:
+            os.waitpid(args.wait, 0)
+        except OSError:
+            pass  # child already dead
+
+    global silent, verbose  # the flags CopyTo()/install()/uninstall() read
+    silent, verbose = args.silent, not args.quiet
+
+    if args.install:
+        install(args.destination)
+
+    if args.remove:
+        uninstall(args.destination)
+
+if __name__ == "__main__":
+    main()
diff --git a/.venv2/Scripts/pywin32_testall.exe b/.venv2/Scripts/pywin32_testall.exe
new file mode 100644
index 0000000..381c7ca
Binary files /dev/null and b/.venv2/Scripts/pywin32_testall.exe differ
diff --git a/.venv2/Scripts/pywin32_testall.py b/.venv2/Scripts/pywin32_testall.py
new file mode 100644
index 0000000..0880a1d
--- /dev/null
+++ b/.venv2/Scripts/pywin32_testall.py
@@ -0,0 +1,120 @@
+"""A test runner for pywin32"""
+
+import os
+import site
+import subprocess
+import sys
+
+# locate the dirs based on where this script is - it may be either in the
+# source tree, or in an installed Python 'Scripts' tree.
+project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+site_packages = [site.getusersitepackages()] + site.getsitepackages()
+
+failures = []
+
+
+# Run a test using subprocess and wait for the result.
+# If we get a returncode != 0, we know that there was an error, but we don't
+# abort immediately - we run as many tests as we can.
+def run_test(script, cmdline_extras):
+    dirname, scriptname = os.path.split(script)
+    # some tests prefer to be run from their directory.
+    cmd = [sys.executable, "-u", scriptname] + cmdline_extras
+    print("--- Running '%s' ---" % script)
+    sys.stdout.flush()
+    result = subprocess.run(cmd, check=False, cwd=dirname)
+    print(f"*** Test script '{script}' exited with {result.returncode}")
+    sys.stdout.flush()
+    if result.returncode:
+        failures.append(script)
+
+
+def find_and_run(possible_locations, extras):
+    for maybe in possible_locations:
+        if os.path.isfile(maybe):
+            run_test(maybe, extras)
+            break
+    else:
+        raise RuntimeError(
+            "Failed to locate a test script in one of %s" % possible_locations
+        )
+
+
+def main():
+    import argparse
+
+    code_directories = [project_root] + site_packages
+
+    parser = argparse.ArgumentParser(
+        description="A script to trigger tests in all subprojects of PyWin32."
+    )
+    parser.add_argument(
+        "-no-user-interaction",
+        default=False,
+        action="store_true",
+        help="(This is now the default - use `-user-interaction` to include them)",
+    )
+
+    parser.add_argument(
+        "-user-interaction",
+        action="store_true",
+        help="Include tests which require user interaction",
+    )
+
+    parser.add_argument(
+        "-skip-adodbapi",
+        default=False,
+        action="store_true",
+        help="Skip the adodbapi tests; useful for CI where there's no provider",
+    )
+
+    args, remains = parser.parse_known_args()
+
+    # win32, win32ui / Pythonwin
+
+    extras = []
+    if args.user_interaction:
+        extras.append("-user-interaction")
+    extras.extend(remains)
+    scripts = [
+        "win32/test/testall.py",
+        "Pythonwin/pywin/test/all.py",
+    ]
+    for script in scripts:
+        maybes = [os.path.join(directory, script) for directory in code_directories]
+        find_and_run(maybes, extras)
+
+    # win32com
+    maybes = [
+        os.path.join(directory, "win32com", "test", "testall.py")
+        for directory in [os.path.join(project_root, "com")] + site_packages
+    ]
+    extras = remains + ["1"]  # only run "level 1" tests in CI
+    find_and_run(maybes, extras)
+
+    # adodbapi
+    if not args.skip_adodbapi:
+        maybes = [
+            os.path.join(directory, "adodbapi", "test", "adodbapitest.py")
+            for directory in code_directories
+        ]
+        find_and_run(maybes, remains)
+        # This script has a hard-coded sql server name in it, (and markh typically
+        # doesn't have a different server to test on) but there is now supposed to be a server out there on the Internet
+        # just to run these tests, so try it...
+        maybes = [
+            os.path.join(directory, "adodbapi", "test", "test_adodbapi_dbapi20.py")
+            for directory in code_directories
+        ]
+        find_and_run(maybes, remains)
+
+    if failures:
+        print("The following scripts failed")
+        for failure in failures:
+            print(">", failure)
+        sys.exit(1)
+    print("All tests passed \\o/")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.venv2/Scripts/ruff.exe b/.venv2/Scripts/ruff.exe
new file mode 100644
index 0000000..4d63508
Binary files /dev/null and b/.venv2/Scripts/ruff.exe differ
diff --git a/.venv2/Scripts/tiny-agents.exe b/.venv2/Scripts/tiny-agents.exe
new file mode 100644
index 0000000..2417f3a
Binary files /dev/null and b/.venv2/Scripts/tiny-agents.exe differ
diff --git a/.venv2/Scripts/tqdm.exe b/.venv2/Scripts/tqdm.exe
new file mode 100644
index 0000000..2e50f3b
Binary files /dev/null and b/.venv2/Scripts/tqdm.exe differ
diff --git a/.venv2/Scripts/ttx.exe b/.venv2/Scripts/ttx.exe
new file mode 100644
index 0000000..a8b2691
Binary files /dev/null and b/.venv2/Scripts/ttx.exe differ
diff --git a/.venv2/Scripts/typer.exe b/.venv2/Scripts/typer.exe
new file mode 100644
index 0000000..d82c9a9
Binary files /dev/null and b/.venv2/Scripts/typer.exe differ
diff --git a/.venv2/Scripts/upload_theme.exe b/.venv2/Scripts/upload_theme.exe
new file mode 100644
index 0000000..1a95626
Binary files /dev/null and b/.venv2/Scripts/upload_theme.exe differ
diff --git a/.venv2/Scripts/uvicorn.exe b/.venv2/Scripts/uvicorn.exe
new file mode 100644
index 0000000..0839f1e
Binary files /dev/null and b/.venv2/Scripts/uvicorn.exe differ
diff --git a/.venv2/Scripts/watchfiles.exe b/.venv2/Scripts/watchfiles.exe
new file mode 100644
index 0000000..a34e57e
Binary files /dev/null and b/.venv2/Scripts/watchfiles.exe differ
diff --git a/.venv2/Scripts/websockets.exe b/.venv2/Scripts/websockets.exe
new file mode 100644
index 0000000..5993bc8
Binary files /dev/null and b/.venv2/Scripts/websockets.exe differ
diff --git a/.venv2/Scripts/wsdump.exe b/.venv2/Scripts/wsdump.exe
new file mode 100644
index 0000000..e959211
Binary files /dev/null and b/.venv2/Scripts/wsdump.exe differ
diff --git a/.venv2/share/man/man1/ipython.1 b/.venv2/share/man/man1/ipython.1
new file mode 100644
index 0000000..0f4a191
--- /dev/null
+++ b/.venv2/share/man/man1/ipython.1
@@ -0,0 +1,60 @@
+.\"                                      Hey, EMACS: -*- nroff -*-
+.\" First parameter, NAME, should be all caps
+.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection
+.\" other parameters are allowed: see man(7), man(1)
+.TH IPYTHON 1 "July 15, 2011"
+.\" Please adjust this date whenever revising the manpage.
+.\"
+.\" Some roff macros, for reference:
+.\" .nh        disable hyphenation
+.\" .hy        enable hyphenation
+.\" .ad l      left justify
+.\" .ad b      justify to both left and right margins
+.\" .nf        disable filling
+.\" .fi        enable filling
+.\" .br        insert line break
+.\" .sp <n>    insert n+1 empty lines
+.\" for manpage-specific macros, see man(7) and groff_man(7)
+.\" .SH        section heading
+.\" .SS        secondary section heading
+.\"
+.\"
+.\" To preview this page as plain text: nroff -man ipython.1
+.\"
+.SH NAME
+ipython \- Tools for Interactive Computing in Python.
+.SH SYNOPSIS
+.B ipython
+.RI [ options ] " files" ...
+
+.B ipython subcommand
+.RI [ options ] ...
+
+.SH DESCRIPTION
+An interactive Python shell with automatic history (input and output), dynamic
+object introspection, easier configuration, command completion, access to the
+system shell, integration with numerical and scientific computing tools,
+web notebook, Qt console, and more.
+
+For more information on how to use IPython, see 'ipython \-\-help',
+or 'ipython \-\-help\-all' for all available command\(hyline options.
+
+.SH "ENVIRONMENT VARIABLES"
+.sp
+.PP
+\fIIPYTHONDIR\fR
+.RS 4
+This is the location where IPython stores all its configuration files.  The default
+is $HOME/.ipython if IPYTHONDIR is not defined.
+
+You can see the computed value of IPYTHONDIR with `ipython locate`.
+
+.SH FILES
+
+IPython uses various configuration files stored in profiles within IPYTHONDIR.
+To generate the default configuration files and start configuring IPython,
+do 'ipython profile create', and edit '*_config.py' files located in
+IPYTHONDIR/profile_default.
+
+.SH AUTHORS
+IPython is written by the IPython Development Team <https://github.com/ipython/ipython>.
diff --git a/.venv2/share/man/man1/isympy.1 b/.venv2/share/man/man1/isympy.1
new file mode 100644
index 0000000..0ff9661
--- /dev/null
+++ b/.venv2/share/man/man1/isympy.1
@@ -0,0 +1,188 @@
+'\" -*- coding: us-ascii -*-
+.if \n(.g .ds T< \\FC
+.if \n(.g .ds T> \\F[\n[.fam]]
+.de URL
+\\$2 \(la\\$1\(ra\\$3
+..
+.if \n(.g .mso www.tmac
+.TH isympy 1 2007-10-8 "" ""
+.SH NAME
+isympy \- interactive shell for SymPy
+.SH SYNOPSIS
+'nh
+.fi
+.ad l
+\fBisympy\fR \kx
+.if (\nx>(\n(.l/2)) .nr x (\n(.l/5)
+'in \n(.iu+\nxu
+[\fB-c\fR | \fB--console\fR] [\fB-p\fR ENCODING | \fB--pretty\fR ENCODING] [\fB-t\fR TYPE | \fB--types\fR TYPE] [\fB-o\fR ORDER | \fB--order\fR ORDER] [\fB-q\fR | \fB--quiet\fR] [\fB-d\fR | \fB--doctest\fR] [\fB-C\fR | \fB--no-cache\fR] [\fB-a\fR | \fB--auto\fR] [\fB-D\fR | \fB--debug\fR] [
+-- | PYTHONOPTIONS]
+'in \n(.iu-\nxu
+.ad b
+'hy
+'nh
+.fi
+.ad l
+\fBisympy\fR \kx
+.if (\nx>(\n(.l/2)) .nr x (\n(.l/5)
+'in \n(.iu+\nxu
+[
+{\fB-h\fR | \fB--help\fR}
+|
+{\fB-v\fR | \fB--version\fR}
+]
+'in \n(.iu-\nxu
+.ad b
+'hy
+.SH DESCRIPTION
+isympy is a Python shell for SymPy. It is just a normal python shell
+(ipython shell if you have the ipython package installed) that executes
+the following commands so that you don't have to:
+.PP
+.nf
+\*(T<
+>>> from __future__ import division
+>>> from sympy import *
+>>> x, y, z = symbols("x,y,z")
+>>> k, m, n = symbols("k,m,n", integer=True)
+    \*(T>
+.fi
+.PP
+So starting isympy is equivalent to starting python (or ipython) and
+executing the above commands by hand. It is intended for easy and quick
+experimentation with SymPy. For more complicated programs, it is recommended
+to write a script and import things explicitly (using the "from sympy
+import sin, log, Symbol, ..." idiom).
+.SH OPTIONS
+.TP
+\*(T<\fB\-c \fR\*(T>\fISHELL\fR, \*(T<\fB\-\-console=\fR\*(T>\fISHELL\fR
+Use the specified shell (python or ipython) as
+console backend instead of the default one (ipython
+if present or python otherwise).
+
+Example: isympy -c python
+
+\fISHELL\fR could be either
+\&'ipython' or 'python'
+.TP
+\*(T<\fB\-p \fR\*(T>\fIENCODING\fR, \*(T<\fB\-\-pretty=\fR\*(T>\fIENCODING\fR
+Setup pretty printing in SymPy. By default, the most pretty, unicode
+printing is enabled (if the terminal supports it). You can use less
+pretty ASCII printing instead or no pretty printing at all.
+
+Example: isympy -p no
+
+\fIENCODING\fR must be one of 'unicode',
+\&'ascii' or 'no'.
+.TP
+\*(T<\fB\-t \fR\*(T>\fITYPE\fR, \*(T<\fB\-\-types=\fR\*(T>\fITYPE\fR
+Setup the ground types for the polys. By default, gmpy ground types
+are used if gmpy2 or gmpy is installed, otherwise it falls back to python
+ground types, which are a little bit slower. You can manually
+choose python ground types even if gmpy is installed (e.g., for testing purposes).
+
+Note that sympy ground types are not supported, and should be used
+only for experimental purposes.
+
+Note that the gmpy1 ground type is primarily intended for testing; it forces the
+use of gmpy even if gmpy2 is available.
+
+This is the same as setting the environment variable
+SYMPY_GROUND_TYPES to the given ground type (e.g.,
+SYMPY_GROUND_TYPES='gmpy')
+
+The ground types can be determined interactively from the variable
+sympy.polys.domains.GROUND_TYPES inside the isympy shell itself.
+
+Example: isympy -t python
+
+\fITYPE\fR must be one of 'gmpy',
+\&'gmpy1' or 'python'.
+.TP
+\*(T<\fB\-o \fR\*(T>\fIORDER\fR, \*(T<\fB\-\-order=\fR\*(T>\fIORDER\fR
+Setup the ordering of terms for printing. The default is lex, which
+orders terms lexicographically (e.g., x**2 + x + 1). You can choose
+other orderings, such as rev-lex, which will use reverse
+lexicographic ordering (e.g., 1 + x + x**2).
+
+Note that for very large expressions, ORDER='none' may speed up
+printing considerably, with the tradeoff that the order of the terms
+in the printed expression will have no canonical order.
+
+Example: isympy -o rev-lex
+
+\fIORDER\fR must be one of 'lex', 'rev-lex', 'grlex',
+\&'rev-grlex', 'grevlex', 'rev-grevlex', 'old', or 'none'.
+.TP
+\*(T<\fB\-q\fR\*(T>, \*(T<\fB\-\-quiet\fR\*(T>
+Print only Python's and SymPy's versions to stdout at startup, and nothing else.
+.TP
+\*(T<\fB\-d\fR\*(T>, \*(T<\fB\-\-doctest\fR\*(T>
+Use the same format that should be used for doctests. This is
+equivalent to '\fIisympy -c python -p no\fR'.
+.TP
+\*(T<\fB\-C\fR\*(T>, \*(T<\fB\-\-no\-cache\fR\*(T>
+Disable the caching mechanism. Disabling the cache may slow certain
+operations down considerably. This is useful for testing the cache,
+or for benchmarking, as the cache can result in deceptive benchmark timings.
+
+This is the same as setting the environment variable SYMPY_USE_CACHE
+to 'no'.
+.TP
+\*(T<\fB\-a\fR\*(T>, \*(T<\fB\-\-auto\fR\*(T>
+Automatically create missing symbols. Normally, typing a name of a
+Symbol that has not been instantiated first would raise NameError,
+but with this option enabled, any undefined name will be
+automatically created as a Symbol. This only works in IPython 0.11.
+
+Note that this is intended only for interactive, calculator style
+usage. In a script that uses SymPy, Symbols should be instantiated
+at the top, so that it's clear what they are.
+
+This will not override any names that are already defined, which
+includes the single character letters represented by the mnemonic
+QCOSINE (see the "Gotchas and Pitfalls" document in the
+documentation). You can delete existing names by executing "del
+name" in the shell itself. You can see if a name is defined by typing
+"'name' in globals()".
+
+The Symbols that are created using this have default assumptions.
+If you want to place assumptions on symbols, you should create them
+using symbols() or var().
+
+Finally, this only works in the top level namespace. So, for
+example, if you define a function in isympy with an undefined
+Symbol, it will not work.
+.TP
+\*(T<\fB\-D\fR\*(T>, \*(T<\fB\-\-debug\fR\*(T>
+Enable debugging output. This is the same as setting the
+environment variable SYMPY_DEBUG to 'True'. The debug status is set
+in the variable SYMPY_DEBUG within isympy.
+.TP
+-- \fIPYTHONOPTIONS\fR
+These options will be passed on to \fIipython (1)\fR shell.
+Only supported when ipython is being used (standard python shell not supported).
+
+Two dashes (--) are required to separate \fIPYTHONOPTIONS\fR
+from the other isympy options.
+
+For example, to run iSymPy without startup banner and colors:
+
+isympy -q -c ipython -- --colors=NoColor
+.TP
+\*(T<\fB\-h\fR\*(T>, \*(T<\fB\-\-help\fR\*(T>
+Print help output and exit.
+.TP
+\*(T<\fB\-v\fR\*(T>, \*(T<\fB\-\-version\fR\*(T>
+Print isympy version information and exit.
+.SH FILES
+.TP
+\*(T<\fI${HOME}/.sympy\-history\fR\*(T>
+Saves the history of commands when using the python
+shell as backend.
+.SH BUGS
+The upstream BTS can be found at \(lahttps://github.com/sympy/sympy/issues\(ra.
+Please report all bugs that you find there; this will help improve
+the overall quality of SymPy.
+.SH "SEE ALSO"
+\fBipython\fR(1), \fBpython\fR(1)
diff --git a/.venv2/share/man/man1/ttx.1 b/.venv2/share/man/man1/ttx.1
new file mode 100644
index 0000000..9a65edf
--- /dev/null
+++ b/.venv2/share/man/man1/ttx.1
@@ -0,0 +1,225 @@
+.Dd May 18, 2004
+.\" ttx is not specific to any OS, but contrary to what groff_mdoc(7)
+.\" seems to imply, entirely omitting the .Os macro causes 'BSD' to
+.\" be used, so I give a zero-width space as its argument.
+.Os \&
+.\" The "FontTools Manual" argument apparently has no effect in
+.\" groff 1.18.1. I think it is a bug in the -mdoc groff package.
+.Dt TTX 1 "FontTools Manual"
+.Sh NAME
+.Nm ttx
+.Nd tool for manipulating TrueType and OpenType fonts
+.Sh SYNOPSIS
+.Nm
+.Bk
+.Op Ar option ...
+.Ek
+.Bk
+.Ar file ...
+.Ek
+.Sh DESCRIPTION
+.Nm
+is a tool for manipulating TrueType and OpenType fonts.  It can convert
+TrueType and OpenType fonts to and from an
+.Tn XML Ns -based format called
+.Tn TTX .
+.Tn TTX
+files have a
+.Ql .ttx
+extension.
+.Pp
+For each
+.Ar file
+argument it is given,
+.Nm
+detects whether it is a
+.Ql .ttf ,
+.Ql .otf
+or
+.Ql .ttx
+file and acts accordingly: if it is a
+.Ql .ttf
+or
+.Ql .otf
+file, it generates a
+.Ql .ttx
+file; if it is a
+.Ql .ttx
+file, it generates a
+.Ql .ttf
+or
+.Ql .otf
+file.
+.Pp
+By default, every output file is created in the same directory as the
+corresponding input file and with the same name except for the
+extension, which is substituted appropriately.
+.Nm
+never overwrites existing files; if necessary, it appends a suffix to
+the output file name before the extension, as in
+.Pa Arial#1.ttf .
+.Ss "General options"
+.Bl -tag -width ".Fl t Ar table"
+.It Fl h
+Display usage information.
+.It Fl d Ar dir
+Write the output files to directory
+.Ar dir
+instead of writing every output file to the same directory as the
+corresponding input file.
+.It Fl o Ar file
+Write the output to
+.Ar file
+instead of writing it to the same directory as the
+corresponding input file.
+.It Fl v
+Be verbose.  Write more messages to the standard output describing what
+is being done.
+.It Fl a
+Allow virtual glyph IDs on compile or decompile.
+.El
+.Ss "Dump options"
+The following options control the process of dumping font files
+(TrueType or OpenType) to
+.Tn TTX
+files.
+.Bl -tag -width ".Fl t Ar table"
+.It Fl l
+List table information.  Instead of dumping the font to a
+.Tn TTX
+file, display minimal information about each table.
+.It Fl t Ar table
+Dump table
+.Ar table .
+This option may be given multiple times to dump several tables at
+once.  When not specified, all tables are dumped.
+.It Fl x Ar table
+Exclude table
+.Ar table
+from the list of tables to dump.  This option may be given multiple
+times to exclude several tables from the dump.  The
+.Fl t
+and
+.Fl x
+options are mutually exclusive.
+.It Fl s
+Split tables.  Dump each table to a separate
+.Tn TTX
+file and write (under the name that would have been used for the output
+file if the
+.Fl s
+option had not been given) one small
+.Tn TTX
+file containing references to the individual table dump files.  This
+file can be used as input to
+.Nm
+as long as the referenced files can be found in the same directory.
+.It Fl i
+.\" XXX: I suppose OpenType programs (exist and) are also affected.
+Don't disassemble TrueType instructions.  When this option is specified,
+all TrueType programs (glyph programs, the font program and the
+pre-program) are written to the
+.Tn TTX
+file as hexadecimal data instead of
+assembly.  This saves some time and results in smaller
+.Tn TTX
+files.
+.It Fl y Ar n
+When decompiling a TrueType Collection (TTC) file,
+decompile font number
+.Ar n ,
+starting from 0.
+.El
+.Ss "Compilation options"
+The following options control the process of compiling
+.Tn TTX
+files into font files (TrueType or OpenType):
+.Bl -tag -width ".Fl t Ar table"
+.It Fl m Ar fontfile
+Merge the input
+.Tn TTX
+file
+.Ar file
+with
+.Ar fontfile .
+No more than one
+.Ar file
+argument can be specified when this option is used.
+.It Fl b
+Don't recalculate glyph bounding boxes.  Use the values in the
+.Tn TTX
+file as is.
+.El
+.Sh "THE TTX FILE FORMAT"
+You can find some information about the
+.Tn TTX
+file format in
+.Pa documentation.html .
+In particular, you will find in that file the list of tables understood by
+.Nm
+and the relations between TrueType GlyphIDs and the glyph names used in
+.Tn TTX
+files.
+.Sh EXAMPLES
+In the following examples, all files are read from and written to the
+current directory.  Additionally, the name given for the output file
+assumes in every case that it did not exist before
+.Nm
+was invoked.
+.Pp
+Dump the TrueType font contained in
+.Pa FreeSans.ttf
+to
+.Pa FreeSans.ttx :
+.Pp
+.Dl ttx FreeSans.ttf
+.Pp
+Compile
+.Pa MyFont.ttx
+into a TrueType or OpenType font file:
+.Pp
+.Dl ttx MyFont.ttx
+.Pp
+List the tables in
+.Pa FreeSans.ttf
+along with some information:
+.Pp
+.Dl ttx -l FreeSans.ttf
+.Pp
+Dump the
+.Sq cmap
+table from
+.Pa FreeSans.ttf
+to
+.Pa FreeSans.ttx :
+.Pp
+.Dl ttx -t cmap FreeSans.ttf
+.Sh NOTES
+On MS\-Windows and MacOS,
+.Nm
+is available as a graphical application to which files can be dropped.
+.Sh SEE ALSO
+.Pa documentation.html
+.Pp
+.Xr fontforge 1 ,
+.Xr ftinfo 1 ,
+.Xr gfontview 1 ,
+.Xr xmbdfed 1 ,
+.Xr Font::TTF 3pm
+.Sh AUTHORS
+.Nm
+was written by
+.An -nosplit
+.An "Just van Rossum" Aq just@letterror.com .
+.Pp
+This manual page was written by
+.An "Florent Rougon" Aq f.rougon@free.fr
+for the Debian GNU/Linux system based on the existing FontTools
+documentation.  It may be freely used, modified and distributed without
+restrictions.
+.\" For Emacs:
+.\" Local Variables:
+.\" fill-column: 72
+.\" sentence-end: "[.?!][]\"')}]*\\($\\| $\\|   \\|  \\)[   \n]*"
+.\" sentence-end-double-space: t
+.\" End:
\ No newline at end of file
diff --git a/MEETING_PULL_SUMMARY_2026-03-25.md b/MEETING_PULL_SUMMARY_2026-03-25.md
new file mode 100644
index 0000000..16d548c
--- /dev/null
+++ b/MEETING_PULL_SUMMARY_2026-03-25.md
@@ -0,0 +1,171 @@
+# Pull Discussion Summary
+
+Comparison basis:
+- Local working tree on `main` at `bf2d9e2`
+- Upstream `origin/main` at `d4a40be`
+- Date: 2026-03-25
+
+This note summarizes the main code changes relative to the current upstream `main`, why they were made, and what still needs discussion before deciding whether to pull this work.
+
+## What Changed at a High Level
+
+- The codebase was moved from mostly global storage assumptions to a project-aware model.
+- Ingestion, retrieval, logs, graph snapshots, and audit outputs can now be scoped to a selected document folder.
+- The Gradio app was updated to work with that project-aware model instead of assuming one global storage location.
+- Graph pickle support was restored and split into two roles:
+  - `kg.pkl` for the editable working graph in the UI
+  - `kg_retrieval.pkl` for the retrieval snapshot built from canonical storage
+- Structured JSON logging was added for question-answer interactions.
+- Extraction was extended to use language information from documents and chunks.
+- Clean-ingestion stability on this Windows/Python 3.12 environment was improved by keeping retrieval operational even when native Chroma vector operations are unstable.
+
+## Project-Aware Ingestion and Storage
+
+The main architectural change is that a selected document folder is now treated as a project root for artifacts.
+
+Indented example:
+
+```text
+C:\path\to\documents\
+  report.pdf
+  notes.txt
+  .appl-kgraph\
+    storage\
+      documents.sqlite
+      chunks.sqlite
+      graph.sqlite
+      chroma_chunks\
+      chroma_entities\
+      chroma_relations\
+    knowledge_graph\
+      kg.pkl
+      kg_retrieval.pkl
+    logs\
+      ingestion.log
+      pathrag.log
+      lightrag.log
+      qa\
+        20260325T....json
+    audits\
+      extraction\
+        report.audit.json
+```
+
+Why this matters:
+- Separate corpora no longer have to share one global storage directory.
+- The UI can later select a project cleanly because the storage layout already supports it.
+- Logs, graph artifacts, and audits stay attached to the document set that produced them.
+
+## Most Important Code Changes and Reasons
+
+- `graph/project_paths.py`
+  - Added a single resolver for turning a document folder into project-local storage, logs, audits, and knowledge-graph paths.
+  - This is the main seam that makes per-project isolation work.
+
+- `graph/ingestion.py`
+  - Ingestion now accepts a project/document root and writes to project-local storage.
+  - Added structured progress messages for the UI during ingestion.
+  - Added retrieval-snapshot writing after ingestion.
+  - Added hooks for audit output and more explicit per-file processing stages.
+
+- `graph/app.py`
+  - The Gradio app now follows the project-aware storage model.
+  - Ingestion status is streamed live in the sidebar instead of only showing a final result.
+  - The knowledge graph rendering was fixed to use an iframe wrapper so the PyVis graph actually displays.
+  - The earlier graph editing tools were restored:
+    - save working graph
+    - load saved graph
+    - edit node
+    - merge nodes
+  - The app now distinguishes between:
+    - the editable working graph pickle
+    - the retrieval graph snapshot
+
+- `graph/graph_pickle.py`
+  - Added helper functions for loading, saving, and rebuilding graph pickles from canonical storage.
+  - This keeps pickle logic out of the UI and retriever code.
+
+- `graph/pathrag.py` and `graph/lightrag.py`
+  - Retrieval now loads the graph from the project-specific retrieval snapshot when available.
+  - Both retrievers now log question-answer interactions to structured JSON files.
+  - Retrieval was adjusted to work with the project-aware storage model instead of a single global storage root.
+
+- `graph/query_logging.py`
+  - Added JSON query logging for QA sessions.
+  - This is useful for debugging, auditability, and reviewing retrieval context after the fact.
+
+- `graph/fileparser.py`
+  - Text-file parsing was changed to avoid depending on `chardet` for normal `.txt` ingestion.
+  - This fixed a real failure seen while ingesting `docs2\UDHR_first_article_all.txt`.
+
+- `graph/extractor.py`, `graph/prompts.py`, and `graph/utils.py`
+  - Extraction prompting now has better language handling and cleaner prompt wiring.
+  - Utility and prompt behavior is more centralized than before.
+
+- `graph/db_storage.py`
+  - Storage was extended to support project-aware paths cleanly.
+  - A SQL-backed similarity fallback was added for environments where native Chroma vector operations are unstable.
+  - On the current Windows/Python 3.12 setup, that fallback is now also used to avoid clean-ingestion crashes during Chroma vector writes.
+
+- `graph/chunker.py`
+  - Overlap behavior was tightened so it does not keep compounding prior overlap unintentionally.
+
+- `graph/settings.py`
+  - Settings were expanded for project layout, logging split, and extraction toggles.
+  - This is one of the larger refactors in the diff and is functionally useful, but also one of the noisier files to review.
+
+- `graph/logging_utils.py`
+  - Added centralized file logger setup so ingestion and retrieval logs can be scoped per project.
+
+- Tests
+  - Added coverage for project-path resolution, graph pickle helpers, and chunk overlap behavior.
+  - Repaired the existing PathRAG storage adapter test.
+
+## New Files Added
+
+- `graph/project_paths.py`
+- `graph/query_logging.py`
+- `graph/graph_pickle.py`
+- `graph/logging_utils.py`
+- `test/test_project_paths.py`
+- `test/test_graph_pickle.py`
+- `test/test_chunker.py`
+
+## Why the Changes Are Good
+
+- The system is easier to reason about because project selection now maps directly to storage layout.
+- The UI and backend are more aligned than before.
+- Retrieval and ingestion leave behind much better operational traces through logs, QA JSON files, and graph snapshots.
+- The graph editing workflow was preserved instead of being lost during the project-scoping changes.
+- The codebase is better positioned for future UI project selection without another storage redesign.
+
+## Open Issues and Discussion Points
+
+- Native Chroma stability on Windows/Python 3.12
+  - On this machine, native Chroma query and native Chroma vector writes both proved unstable.
+  - The current solution keeps the app working by relying on SQL-backed similarity instead.
+  - This is practical, but it is still a workaround and should be discussed explicitly.
+
+- Performance implications of the fallback
+  - The SQL-backed similarity path is slower than native Chroma ANN search on larger corpora.
+  - For current functionality it works, but it is not the ideal long-term path if Chroma can be stabilized.
+
+- Duplicate basename handling during ingestion
+  - Some ingestion logic still keys documents by filename rather than full filepath.
+  - Two files with the same name in different subfolders of one project can still collide.
+
+- Second-pass extraction behavior
+  - There is audit support, but second-pass findings are still not cleanly positioned as graph-augmenting extraction.
+  - This should be clarified before calling the extraction flow final.
+
+- Working graph vs canonical graph
+  - `kg.pkl` is still a working UI artifact.
+  - UI graph edits do not currently sync back into canonical retrieval storage.
+  - That separation is intentional, but it should be understood by everyone discussing the pull.
+
+- Size and reviewability of the diff
+  - `app.py`, `settings.py`, `extractor.py`, and `db_storage.py` carry a large share of the churn.
+  - Functionally, many of the changes are reasonable.
+  - Review-wise, this may still be easier to land if split into smaller pull requests.
+
+#
diff --git a/README.md b/README.md
index 3a6ba9a..a8740dd 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # appl-kgraph
 
+This project is being developed with **Python 3.12** as its intended local environment and runtime baseline.
+
 **appl-kgraph** is a modular **graph-based Retrieval-Augmented Generation (RAG)** system with pluggable retrieval strategies. It is designed for querying large document collections by combining classic vector-based retrieval with an explicit **knowledge graph** over entities and relations extracted from the source texts.
 
 The project builds on ideas from recent graph-based RAG research, most notably **LightRAG** and **PathRAG**, and provides a shared graph + vector indexing layer on top of which multiple retrieval strategies can be implemented, compared, and extended.
@@ -129,18 +131,20 @@ If your document collection includes `.docx` files, ensure that **Microsoft Word
 
 1. Open a terminal (Anaconda Prompt or standard shell)
 2. Navigate to the project root
-3. Create a virtual environment:
+3. Verify that your active interpreter is Python 3.12 before installing dependencies.
+4. Create a virtual environment with Python 3.12:
 
+   ```text
+   Windows: py -3.12 -m venv venv
+   macOS/Linux: python3.12 -m venv venv
    ```
-   python -m venv venv
-   ```
-4. Activate it:
+5. Activate it:
 
    * Windows: `venv\Scripts\activate`
    * macOS/Linux: `source venv/bin/activate`
-5. Install dependencies:
+6. Install dependencies:
 
-   ```
+   ```text
    pip install -r requirements.txt
    ```
 
@@ -150,8 +154,9 @@ If your document collection includes `.docx` files, ensure that **Microsoft Word
 
 From the project root, run:
 
-```
-python graph/main.py
+```text
+Windows: py -3.12 graph/main.py
+macOS/Linux: python3.12 graph/main.py
 ```
 
 This will execute the current end-to-end pipeline using the configured retrieval strategy.
@@ -177,5 +182,3 @@ For questions, feedback, or collaboration inquiries, you can contact the maintai
 
 📧 <a href='mailto:stefan.troost@pbl.nl, k.wittenberg@scp.nl'>Contact link</a>
 
-
-
diff --git a/docs/.appl-kgraph/knowledge_graph/kg.pkl b/docs/.appl-kgraph/knowledge_graph/kg.pkl
new file mode 100644
index 0000000..42fb95d
Binary files /dev/null and b/docs/.appl-kgraph/knowledge_graph/kg.pkl differ
diff --git a/docs/.appl-kgraph/knowledge_graph/kg_retrieval.pkl b/docs/.appl-kgraph/knowledge_graph/kg_retrieval.pkl
new file mode 100644
index 0000000..3b534f5
Binary files /dev/null and b/docs/.appl-kgraph/knowledge_graph/kg_retrieval.pkl differ
diff --git a/docs2/.appl-kgraph/knowledge_graph/kg.pkl b/docs2/.appl-kgraph/knowledge_graph/kg.pkl
new file mode 100644
index 0000000..f9d2ee5
Binary files /dev/null and b/docs2/.appl-kgraph/knowledge_graph/kg.pkl differ
diff --git a/docs2/.appl-kgraph/knowledge_graph/kg_retrieval.pkl b/docs2/.appl-kgraph/knowledge_graph/kg_retrieval.pkl
new file mode 100644
index 0000000..e3da44d
Binary files /dev/null and b/docs2/.appl-kgraph/knowledge_graph/kg_retrieval.pkl differ
diff --git a/docs_store/UDHR_first_article_all.txt b/docs2/UDHR_first_article_all.txt
similarity index 100%
rename from docs_store/UDHR_first_article_all.txt
rename to docs2/UDHR_first_article_all.txt
diff --git a/docs2/meeting-change-summary-2026-03-18.pdf b/docs2/meeting-change-summary-2026-03-18.pdf
new file mode 100644
index 0000000..b9be681
Binary files /dev/null and b/docs2/meeting-change-summary-2026-03-18.pdf differ
diff --git a/graph/app.py b/graph/app.py
index 46e54c9..c7ce506 100644
--- a/graph/app.py
+++ b/graph/app.py
@@ -1,713 +1,570 @@
-import asyncio
+from __future__ import annotations
+
+import html
+import queue
+import tempfile
+import threading
+from pathlib import Path
+from typing import Any, Iterator, List, Optional, Tuple
+
 import gradio as gr
+import matplotlib.colors as mcolors
+import matplotlib.pyplot as plt
 import networkx as nx
 from pyvis.network import Network
-import os, json, pickle, random, glob, tempfile, base64
-from typing import Optional, List, Tuple
-import matplotlib.pyplot as plt
-import matplotlib.colors as mcolors
-from pathlib import Path
-from win32api import GetSystemMetrics
-from gradio import themes
-# local imports
-from fileparser import FileParser
-from ingestion_app import ingest_paths
-from pathrag import PathRAG, render_full_context
-from lightrag import LightRAG, render_full_context
-import settings
+
+from ingestion import ingest_paths
+from lightrag import LightRAG
+from pathrag import PathRAG, StorageAdapter as PathStorageAdapter
+from graph_pickle import load_graph_from_pickle, save_graph_to_pickle
+from project_paths import (
+    ProjectPaths,
+    ensure_project_dirs,
+    list_document_paths,
+    resolve_project_paths,
+)
 
 # =====================================================
 # -------- INITIAL RANDOM GRAPH CREATION --------------
 # =====================================================
+
 mygraph = nx.Graph()
-# COLOR_MODE = "type"
+_PATHRAG_CACHE: dict[str, PathRAG] = {}
+_LIGHTRAG_CACHE: dict[str, LightRAG] = {}
 
 # =====================================================
 # -------- DYNAMIC COLOR GENERATION -------------------
 # =====================================================
 def generate_dynamic_type_colors(graph):
     types = sorted(set(data.get("type", "Unknown") for _, data in graph.nodes(data=True)))
+    if not types:
+        return {}
     cmap = plt.get_cmap("tab20", len(types))
-    return {t: mcolors.to_hex(cmap(i)) for i, t in enumerate(types)}
+    return {node_type: mcolors.to_hex(cmap(index)) for index, node_type in enumerate(types)}
 
-SOURCE_COLORS = {
-    "User": "#17becf", "System": "#bcbd22", "External": "#e377c2", "Default": "#7f7f7f"
-}
-
-# =====================================================
-# -------- GRAPH RENDERING ----------------------------
-# =====================================================
-# def render_graph_iframe(graph, color_mode="type", height_px=600):
-def render_graph_iframe(graph, height_px=600):
-    TYPE_COLORS = generate_dynamic_type_colors(graph)
-    
-    # net = Network(height=f"{height_px}px", width="100%", directed=False, bgcolor="#111111", select_menu=True, filter_menu=True)
-    net = Network(height=f"{height_px}px", width="100%", directed=False, bgcolor="#111111")
-    # net.set_options("""
-    #     var options = {
-    #         "nodes": {"font": {"color": "white","size":10,"face":"arial"}, "borderWidth": 2},
-    #         "edges": {"color":{"color":"#AAAAAA"}, "smooth": false}
-    #     }
-    # """)
-
-    # # Important: disable physics BEFORE adding the graph
-    # net.barnes_hut()  # optional layout settings
-    # net.toggle_physics(False)
-
-    # Load the graph directly (fast)
-    graph_path = os.path.join("knowledge_graph", "kg.pkl")
-    with open(graph_path, 'rb') as f:
-        graph = pickle.load(f)
-    net.from_nx(graph)
-
-    # for node, data in graph.nodes(data=True):
-    #     node_label = data.get("label", "Unknown")
-    #     node_type = data.get("type", "Unknown")
-    #     node_source = data.get("source", "Unknown")
-    #     node_description = data.get("description", "Unknown")
-    #     # if color_mode == "type":
-    #     color = TYPE_COLORS.get(node_type, "#7f7f7f")
-    #     # else:
-    #     #     color = SOURCE_COLORS.get(node_source, SOURCE_COLORS["Default"])
-    #     title = f"Label: {node_label}, Type: {node_type}, Source: {node_source}, Description: {node_description}"
-    #     net.add_node(node, label=data.get("label", node), title=title, color=color)
-
-    # for u, v, data in graph.edges(data=True):
-    #     rel = data.get("relation_type", "relation")
-    #     desc = data.get("relation_description", "")
-    #     weight = data.get("weight", 1)
-    #     net.add_edge(u, v, title=f"{rel}, Description: {desc}, Weight: {weight}", value=weight)
-
-    net.set_options("""
-        var options = {
-            "nodes": {
-                "borderWidth": 0,
-                "borderWidthSelected": 4,
-                "opacity": 1,
-                "fixed": {
-                    "x": true,
-                    "y": true
-                },
-                "font": {
-                    "strokeWidth": 10
-                },
-                "size": 14
-            },
-            "edges": {
-                "arrows": {
-                    "middle": {
-                        "enabled": true
-                    }
-                },
-                "selfReferenceSize": null,
-                "selfReference": {
-                    "angle": 0.7853981633974483
-                },
-                "smooth": {
-                    "forceDirection": "none"
-                }
-            },
-            "interaction": {
-                "hover": true,
-                "multiselect": true,
-                "navigationButtons": true
-            },
-            "manipulation": {
-                "enabled": true,
-                "initiallyActive": true
-            },
-            "physics": {
-                "enabled": false,
-                "minVelocity": 0.75
-            }
-        }
-    """)
-
-    # net.show_buttons(filter_=['physics'])
-    # net.show_buttons()
-
-    # net.from_nx(graph)
-    # # Define the custom JS code for hover effect
-    # hover_code = """
-    #     var highlighted = [];
-    #     var neighbors = [];
-        
-    #     // Event for hovering over nodes
-    #     network.on('hoverNode', function(params) {
-    #         var node_id = params.node;
-            
-    #         // Get the node data
-    #         highlighted = [];
-    #         neighbors = [];
-            
-    #         // Highlight the hovered node
-    #         highlighted.push(node_id);
-            
-    #         // Get neighbors of the hovered node
-    #         var connectedNodes = network.getConnectedNodes(node_id);
-    #         neighbors = connectedNodes;
-
-    #         // Style update for highlighting nodes
-    #         network.nodes.forEach(function(node) {
-    #             if (highlighted.includes(node.id)) {
-    #                 node.color = {background: 'orange', border: 'black'};
-    #                 node.font = {color: 'white'};
-    #             } else if (neighbors.includes(node.id)) {
-    #                 node.color = {background: 'yellow', border: 'black'};
-    #             } else {
-    #                 node.color = {background: 'gray', border: 'gray'};
-    #                 node.font = {color: 'gray'};
-    #             }
-    #         });
-            
-    #         // Style update for edges
-    #         network.edges.forEach(function(edge) {
-    #             if (highlighted.includes(edge.from) && highlighted.includes(edge.to)) {
-    #                 edge.color = 'orange';
-    #                 edge.width = 4;
-    #             } else if (neighbors.includes(edge.from) || neighbors.includes(edge.to)) {
-    #                 edge.color = 'yellow';
-    #                 edge.width = 2;
-    #             } else {
-    #                 edge.color = 'gray';
-    #                 edge.width = 1;
-    #             }
-    #         });
-
-    #         // Refresh the network to apply the styles
-    #         network.redraw();
-    #     });
-        
-    #     // Reset on mouseout
-    #     network.on('blurNode', function() {
-    #         network.nodes.forEach(function(node) {
-    #             node.color = {background: 'lightblue', border: 'black'};
-    #             node.font = {color: 'black'};
-    #         });
-            
-    #         network.edges.forEach(function(edge) {
-    #             edge.color = 'black';
-    #             edge.width = 2;
-    #         });
-            
-    #         network.redraw();
-    #     });
-    # """
-
-    # # Define the custom JS code for click effect
-    # click_code = """
-    #     var highlighted = [];
-    #     var neighbors = [];
-        
-    #     // Event for clicking on nodes
-    #     network.on('selectNode', function(params) {
-    #         var node_id = params.nodes[0];
-            
-    #         // Get the node data
-    #         highlighted = [];
-    #         neighbors = [];
-            
-    #         // Highlight the clicked node
-    #         highlighted.push(node_id);
-            
-    #         // Get neighbors of the clicked node
-    #         var connectedNodes = network.getConnectedNodes(node_id);
-    #         neighbors = connectedNodes;
-
-    #         // Style update for highlighting nodes
-    #         network.nodes.forEach(function(node) {
-    #             if (highlighted.includes(node.id)) {
-    #                 node.color = {background: 'orange', border: 'black'};
-    #                 node.font = {color: 'white'};
-    #             } else if (neighbors.includes(node.id)) {
-    #                 node.color = {background: 'yellow', border: 'black'};
-    #             } else {
-    #                 node.color = {background: 'gray', border: 'gray'};
-    #                 node.font = {color: 'gray'};
-    #             }
-    #         });
-            
-    #         // Style update for edges
-    #         network.edges.forEach(function(edge) {
-    #             if (highlighted.includes(edge.from) && highlighted.includes(edge.to)) {
-    #                 edge.color = 'orange';
-    #                 edge.width = 4;
-    #             } else if (neighbors.includes(edge.from) || neighbors.includes(edge.to)) {
-    #                 edge.color = 'yellow';
-    #                 edge.width = 2;
-    #             } else {
-    #                 edge.color = 'gray';
-    #                 edge.width = 1;
-    #             }
-    #         });
-
-    #         // Refresh the network to apply the styles
-    #         network.redraw();
-    #     });
-
-    #     // Reset the styling when clicking anywhere outside a node
-    #     network.on('deselectNode', function() {
-    #         network.nodes.forEach(function(node) {
-    #             node.color = {background: 'lightblue', border: 'black'};
-    #             node.font = {color: 'black'};
-    #         });
-            
-    #         network.edges.forEach(function(edge) {
-    #             edge.color = 'black';
-    #             edge.width = 2;
-    #         });
-            
-    #         network.redraw();
-    #     });
-    # """
-
-
-    # # Add the click event script
-    # net.set_options(click_code)
-
-    # net.show("network_click.html")
-    tmp_path = os.path.join(tempfile.gettempdir(), "graph.html")
-    net.save_graph(tmp_path)
-    with open(tmp_path, "r", encoding="utf-8") as f:
-        html_content = f.read()
-    b64 = base64.b64encode(html_content.encode("utf-8")).decode("utf-8")
-    return f'<iframe src="data:text/html;base64,{b64}" style="width:100%; height:{height_px}px; border:none;"></iframe>'
 
-# =====================================================
-# -------- GRAPH INTERACTION --------------------------
-# =====================================================
-# Function to handle node click event
-def node_click(node_id):
-    # Return node details
-    node_details = {
-        1: "Node 1: This is Node 1\nColor: Red\nSize: 15",
-        2: "Node 2: This is Node 2\nColor: Green\nSize: 15",
-        3: "Node 3: This is Node 3\nColor: Blue\nSize: 15"
-    }
-    
-    # Return the details of the clicked node (or a default message if node not found)
-    return node_details.get(node_id, "Node details not found.")
-
-# =====================================================
-# -------- LEGEND -------------------------------------
-# =====================================================
-# def generate_legend_html(color_mode="type", graph=None):
-def generate_legend_html(graph=None):
-    # if color_mode == "type":
-    COLORS = generate_dynamic_type_colors(graph)
-    # else:
-    #     COLORS = SOURCE_COLORS
+def generate_legend_html(graph: Optional[nx.Graph] = None) -> str:
+    colors = generate_dynamic_type_colors(graph or nx.Graph())
     html = "<div style='padding:5px;'><b>Legend:</b><br>"
-    for key, color in COLORS.items():
-        html += f"<div style='display:flex;align-items:center;margin:2px;'>"
-        html += f"<div style='width:20px;height:20px;background-color:{color};margin-right:5px;border:1px solid #fff;'></div>{key}</div>"
+    for key, color in colors.items():
+        html += "<div style='display:flex;align-items:center;margin:2px;'>"
+        html += (
+            f"<div style='width:20px;height:20px;background-color:{color};"
+            "margin-right:5px;border:1px solid #fff;'></div>"
+            f"{key}</div>"
+        )
     html += "</div>"
     return html
 
-# =====================================================
-# -------- NODE OPERATIONS ----------------------------
-# =====================================================
-# def merge_nodes(node1, node2, color_mode):
-def merge_nodes(node1, node2):
-    if node1 not in mygraph or node2 not in mygraph:
-        # return render_graph_iframe(mygraph, color_mode), generate_legend_html(color_mode, mygraph), "⚠️ Both nodes must exist."
-        return render_graph_iframe(mygraph), generate_legend_html(mygraph), "⚠️ Both nodes must exist."
-    if node1 == node2:
-        # return render_graph_iframe(mygraph, color_mode), generate_legend_html(color_mode, mygraph), "⚠️ Cannot merge same node."
-        return render_graph_iframe(mygraph), generate_legend_html(mygraph), "⚠️ Cannot merge same node."
-    # define new node: labels are concatenated
-    new_node = f"{node1}_{node2}"
-    mygraph.add_node(new_node, label=f"{mygraph.nodes[node1].get('label','')} + {mygraph.nodes[node2].get('label','')}", type="Merged", description=f"Merged from {node1} and {node2}", source="Merged")
-    # after creating the new merged node, assign the old edges to the new node and delete the old nodes
-    for n in [node1, node2]:
-        for neighbor, attrs in list(mygraph[n].items()):
-            if neighbor not in [node1, node2]:
-                mygraph.add_edge(new_node, neighbor, **attrs)
-        mygraph.remove_node(n)
-    # return render_graph_iframe(mygraph, color_mode), generate_legend_html(color_mode, mygraph), f"🔄 Merged '{node1}'+'{node2}' → '{new_node}'."
-    return render_graph_iframe(mygraph), generate_legend_html(mygraph), f"🔄 Merged '{node1}'+'{node2}' → '{new_node}'."
 
+def _project_paths_for_folder(folder_path: str) -> Optional[ProjectPaths]:
+    if not folder_path:
+        return None
+    return resolve_project_paths(folder_path)
 
-# def update_node_attributes(node_id, new_label, new_type, new_desc, new_source, color_mode):
-def update_node_attributes(node_id, new_label, new_type, new_desc, new_source):
-    if node_id not in mygraph:
-        # return render_graph_iframe(mygraph, color_mode), generate_legend_html(color_mode, mygraph), f"⚠️ Node '{node_id}' not found."
-        return render_graph_iframe(mygraph), generate_legend_html(mygraph), f"⚠️ Node '{node_id}' not found."
-    if new_label:
-        mygraph.nodes[node_id]['label'] = new_label
-    if new_type:
-        mygraph.nodes[node_id]['type'] = new_type
-    if new_desc:
-        mygraph.nodes[node_id]['description'] = new_desc
-    if new_source:
-        mygraph.nodes[node_id]['source'] = new_source
-    # return render_graph_iframe(mygraph, color_mode), generate_legend_html(color_mode, mygraph), f"✏️ Node '{node_id}' updated."
-    return render_graph_iframe(mygraph), generate_legend_html(mygraph), f"✏️ Node '{node_id}' updated."
 
-# =====================================================
-# -------- SAVE GRAPH ---------------------------------
-# =====================================================
-# def save_graph_json():
-#     data = {
-#         "nodes": [dict(id=n, **d) for n, d in G.nodes(data=True)],
-#         "links": [dict(source=u, target=v, **d) for u, v, d in G.edges(data=True)]
-#     }
-#     with open("graph.json", "w", encoding="utf-8") as f:
-#         json.dump(data, f, indent=2)
-#     return "✅ Graph saved as graph.json"
-
-def save_graph_pickle():
-    with open("graph.pkl", "wb") as f:
-        pickle.dump(mygraph, f)
-    return "✅ Graph saved as graph.pkl"
-
-
-def handle_ingestion(folder_path):
-    # =====================================================
-    # -------- FOLDER-BASED DOCUMENT SELECTION ------------
-    # =====================================================
-    # Initialize an empty list to store status messages
-    status_messages = []
-    if not folder_path or not os.path.isdir(folder_path):
-        yield None, "⚠️ Please enter a valid folder path."
-    else:
-        paths = [Path(os.path.join(folder_path, f)) for f in os.listdir(folder_path) 
-                 if ((os.path.isfile(os.path.join(folder_path, f))) and (Path(f).suffix in settings.VALID_EXTENSIONS))]
-        if not paths:
-            yield None, f"📂 No files found with extensions: {', '.join(settings.VALID_EXTENSIONS)}."
-        else:
-            # Iterate over the generator returned by process_files
-            for graph, status in ingest_paths(paths):
-                status_messages.append(status)
-                # Yield the current status messages
-                yield graph, "\n".join(status_messages)
-
-
-async def create_pathrag_response(question: str, chat_history: List[gr.ChatMessage]) -> Tuple[str, List[gr.ChatMessage], str]:
-    """
-    A response is obtained from Pathrag knowledge graph retrieval
-
-    Parameters
-    ----------
-    question : str
-        the currently user submitted question
-    chat_history : List[gr.ChatMessage]
-        a list of historical chatmessages, alternating between user and assistant
-
-    Returns
-    -------
-    List[gr.ChatMessage]
-        the updated list of historical chatmessages, alternating between user and assistant
-    """
-    chat_history.append(gr.ChatMessage(role="user", content=question))
-    pathrag = PathRAG(
-        system_prompt=""
+def _load_graph_from_storage(folder_path: str) -> nx.Graph:
+    if not folder_path:
+        return nx.Graph()
+    project_paths = resolve_project_paths(folder_path)
+    if not Path(project_paths.storage.graph_db).exists():
+        return nx.Graph()
+    adapter = PathStorageAdapter(paths=project_paths.storage)
+    return adapter.graph.copy()
+
+
+def _load_graph_from_pickle(folder_path: str) -> nx.Graph:
+    project_paths = _project_paths_for_folder(folder_path)
+    if project_paths is None or not project_paths.graph_pickle_file.exists():
+        return nx.Graph()
+    return load_graph_from_pickle(project_paths.graph_pickle_file)
+
+
+def _save_graph_pickle(folder_path: str, graph: nx.Graph) -> Optional[Path]:
+    project_paths = _project_paths_for_folder(folder_path)
+    if project_paths is None:
+        return None
+    ensure_project_dirs(project_paths)
+    return save_graph_to_pickle(graph, project_paths.graph_pickle_file)
+
+
+def render_graph_iframe(graph: nx.Graph, height_px: int = 650) -> str:
+    type_colors = generate_dynamic_type_colors(graph)
+    net = Network(height=f"{height_px}px", width="100%", directed=False, bgcolor="#111111", font_color="white")
+    for node, data in graph.nodes(data=True):
+        node_type = data.get("type", "unknown")
+        title = f"type={node_type}\n{data.get('description', '')}"
+        color = type_colors.get(node_type, "#0EA5E9")
+        label = str(data.get("label", node))
+        net.add_node(str(node), label=label, title=title, color=color)
+
+    for source, target, data in graph.edges(data=True):
+        title = data.get("description", "") or data.get("keywords", "")
+        net.add_edge(str(source), str(target), title=title, value=float(data.get("weight", 1.0) or 1.0))
+
+    net.set_options(
+        """
+        var options = {
+          "nodes": { "shape": "dot", "size": 14, "font": { "strokeWidth": 8 } },
+          "edges": { "smooth": false, "color": { "color": "#94A3B8" } },
+          "physics": { "enabled": false },
+          "interaction": { "hover": true, "navigationButtons": true }
+        }
+        """
     )
-    result = await pathrag.aretrieve(question)
-    
-    # print(f"Pathrag result = {result}")
-
-    # chat_history.append(gr.ChatMessage(role="assistant", content=result.answer, metadata={"title":  "Processing..."}))
-    chat_history.append(gr.ChatMessage(role="assistant", content=result.answer))
-    # print(f"chat_history = {chat_history}")
-    # make sure that the msg_input textbox is cleared again and return the updated chat history
-    lines = ""
-    if result.chunk_matches:
-        for i, chunk in enumerate(result.chunk_matches, 1):
-            head = f"{chunk.filename or chunk.document_id or '(unknown doc)'}"
-            lines += f"{i}. {head} (score={chunk.score:.3f})\n"
-            lines += f"{chunk.text}\n"
-            lines += 46*"-" + "\n\n"
-    # else:
-    #     lines.append("(none)")
-    
-    print()
-    return "", chat_history, lines
-
-
-async def create_lightrag_response(question: str, chat_history: List[gr.ChatMessage]) -> Tuple[str, List[gr.ChatMessage], str]:
-    """
-    A response is obtained from Lightrag knowledge graph retrieval
-
-    Parameters
-    ----------
-    question : str
-        the currently user submitted question
-    chat_history : List[gr.ChatMessage]
-        a list of historical chatmessages, alternating between user and assistant
-
-    Returns
-    -------
-    List[gr.ChatMessage]
-        the updated list of historical chatmessages, alternating between user and assistant
-    """
-    chat_history.append(gr.ChatMessage(role="user", content=question))
-    lightrag = LightRAG(
-        system_prompt=""
+
+    tmp_path = Path(tempfile.gettempdir()) / "appl_kgraph_graph.html"
+    net.save_graph(str(tmp_path))
+    rendered = tmp_path.read_text(encoding="utf-8")
+    return (
+        f'<iframe srcdoc="{html.escape(rendered, quote=True)}" '
+        f'style="width:100%;height:{height_px}px;border:none;border-radius:8px;" '
+        'sandbox="allow-scripts allow-same-origin"></iframe>'
     )
-    result = await lightrag.aretrieve(question)
-    
-    # print(f"Lightrag result = {result}")
-    
-    # chat_history.append(gr.ChatMessage(role="assistant", content=result.answer, metadata={"title":  "Processing..."}))
-    chat_history.append(gr.ChatMessage(role="assistant", content=result.answer))
-    # print(f"chat_history = {chat_history}")
-    # make sure that the msg_input textbox is cleared again and return the updated chat history
-    lines = ""
-    if result.all_chunks:
-        for i, chunk in enumerate(result.all_chunks, 1):
-            # head = f"{chunk.filename or chunk.document_id or '(unknown doc)'}"
-            # lines += f"{i}. {head} (score={chunk.score:.3f})\n"
-            lines += f"{i}. {chunk['source_type']}"
-            if chunk['source_type'] == "vector":
-                lines += f" (score={chunk['score']:.3f})"
-            lines += "\n"
-            lines += f"{chunk['text']}\n"
-            lines += 46*"-" + "\n\n"
-
-    return "", chat_history, lines
-
-# async def predict(input, history):
-#     """
-#     Predict the response of the chatbot and complete a running list of chat history.
-#     """
-#     history.append({"role": "user", "content": input})
-#     response = await create_pathrag_response(history)
-#     history.append({"role": "assistant", "content": response})
-#     messages = [(history[i]["content"], history[i+1]["content"]) for i in range(0, len(history)-1, 2)]
-#     return messages, history
-
-
-
-    # query = "Who are the authors of LayoutParser and do they overlap any of the other articles?"
-    # query = input("Enter your question: ")
-    # conversation_history = []
-    # while query not in ("exit", "quit"):
-    #     print("\n--- PathRAG Response ---\n")
-    #     asyncio.run(graph.main.ask_with_pathrag(query, verbose=True))
-    #     print("\n---\n")
-    #     print("\n--- LightRAG Response ---\n")
-    #     result = asyncio.run(graph.main.ask_with_lightrag(query, verbose=True))
-    #     conversation_history.append((query, result.answer))
-    #     print("\n---\n")
-    #     query = input("Enter your question: ")
-
-    # try:
-    #     file_path = os.path.join(folder_path, "knowledge_graphs", "azureopenai_gpt-4o_azureopenai_text-embedding-ada-002_NLTKTextSplitter_2000_200", "knowledge_graph.pkl")
-    #     with open(file_path, "rb") as f:
-    #         newG = pickle.load(f)
-    #     if not isinstance(newG, nx.Graph):
-    #         raise ValueError("Invalid pickle file.")
-    #     G.clear()
-    #     G.add_nodes_from(newG.nodes(data=True))
-    #     G.add_edges_from(newG.edges(data=True))
-    #     msg = f"📦 Loaded Pickle graph from {file_path}"
-    #     # return render_graph_iframe(G, color_mode), generate_legend_html(color_mode, G), msg
-    #     return render_graph_iframe(G), generate_legend_html(G), msg
-    # except Exception as e:
-    #     # return render_graph_iframe(G, color_mode), generate_legend_html(color_mode, G), f"❌ Load failed: {e}"
-    #     return render_graph_iframe(G), generate_legend_html(G), f"❌ Load failed: {e}"
 
-# =====================================================
-# -------- DROPDOWN UPDATE HELPER ---------------------
-# =====================================================
-# def update_dropdowns():
-#     nodes = sorted(G.nodes())
-#     return [gr.update(choices=nodes) for _ in range(9)]
+
+def _get_pathrag(folder_path: str) -> PathRAG:
+    rag = _PATHRAG_CACHE.get(folder_path)
+    if rag is None:
+        rag = PathRAG(project_paths=resolve_project_paths(folder_path), system_prompt="")
+        _PATHRAG_CACHE[folder_path] = rag
+    return rag
+
+
+def _get_lightrag(folder_path: str) -> LightRAG:
+    rag = _LIGHTRAG_CACHE.get(folder_path)
+    if rag is None:
+        rag = LightRAG(project_paths=resolve_project_paths(folder_path), system_prompt="")
+        _LIGHTRAG_CACHE[folder_path] = rag
+    return rag
+
+
+def _history_to_turns(chat_history: List[dict]) -> List[Tuple[str, str]]:
+    turns: List[Tuple[str, str]] = []
+    for message in chat_history or []:
+        if isinstance(message, dict):
+            role = message.get("role", "")
+            content = message.get("content", "")
+        else:
+            role = getattr(message, "role", "")
+            content = getattr(message, "content", "")
+        if role and content:
+            turns.append((role, content))
+    return turns
+
+
+def _dropdown_choices() -> List[Tuple[str, str]]:
+    choices: List[Tuple[str, str]] = []
+    for node, data in mygraph.nodes(data=True):
+        label = data.get("label", node)
+        choices.append((f"{label} ({node})", str(node)))
+    return sorted(choices, key=lambda item: item[0].lower())
+
+
 def update_dropdowns():
-    # Create dropdown entries with label → id mapping
-    nodes = sorted(mygraph.nodes())
-    labeled_nodes = [
-        (f"{data.get('label', n)} ({n})", n)
-        for n, data in mygraph.nodes(data=True)
+    choices = _dropdown_choices()
+    return [gr.update(choices=choices, value=None) for _ in range(3)]
+
+
+def _ingestion_payload(
+    graph: nx.Graph,
+    status: str,
+    active_folder_value: str,
+) -> Tuple[str, str, str, str, Any, Any, Any]:
+    updates = update_dropdowns()
+    return render_graph_iframe(graph), generate_legend_html(graph), status, active_folder_value, *updates
+
+
+def handle_ingestion(folder_path: str) -> Iterator[Tuple[str, str, str, str, Any, Any, Any]]:
+    global mygraph
+
+    if not folder_path or not Path(folder_path).is_dir():
+        yield _ingestion_payload(mygraph, "Please provide a valid folder path.", "")
+        return
+
+    documents_root = Path(folder_path).expanduser().resolve()
+    paths = list_document_paths(documents_root)
+    if not paths:
+        empty_graph = nx.Graph()
+        yield _ingestion_payload(
+            empty_graph,
+            "No supported files found in the selected folder.",
+            str(documents_root),
+        )
+        return
+
+    progress_messages = [
+        f"Preparing ingestion for {documents_root}",
+        f"Discovered {len(paths)} supported files",
     ]
-    return [gr.update(choices=labeled_nodes) for _ in range(3)]
+    yield _ingestion_payload(mygraph, "\n".join(progress_messages), str(documents_root))
+
+    progress_queue: queue.Queue[str] = queue.Queue()
+    outcome: dict[str, Any] = {}
+    error: dict[str, Exception] = {}
+
+    def _report_progress(message: str) -> None:
+        progress_queue.put(message)
+
+    def _run_ingestion() -> None:
+        try:
+            outcome["summary"] = ingest_paths(
+                paths,
+                documents_root=documents_root,
+                progress_callback=_report_progress,
+            )
+        except Exception as exc:  # pragma: no cover - UI integration guard
+            error["exception"] = exc
+        finally:
+            progress_queue.put("__DONE__")
+
+    worker = threading.Thread(target=_run_ingestion, daemon=True)
+    worker.start()
+
+    while True:
+        try:
+            message = progress_queue.get(timeout=0.2)
+        except queue.Empty:
+            continue
+
+        if message == "__DONE__":
+            break
+
+        progress_messages.append(message)
+        yield _ingestion_payload(mygraph, "\n".join(progress_messages[-50:]), str(documents_root))
+
+    if "exception" in error:
+        progress_messages.append(f"Error: {error['exception']}")
+        yield _ingestion_payload(mygraph, "\n".join(progress_messages[-50:]), str(documents_root))
+        return
+
+    summary = outcome["summary"]
+    mygraph = _load_graph_from_storage(str(documents_root))
+    _PATHRAG_CACHE.pop(str(documents_root), None)
+    _LIGHTRAG_CACHE.pop(str(documents_root), None)
+
+    pickle_note = ""
+    project_paths = resolve_project_paths(documents_root)
+    if not project_paths.graph_pickle_file.exists():
+        saved_path = _save_graph_pickle(str(documents_root), mygraph)
+        if saved_path is not None:
+            pickle_note = f"\nSaved baseline graph pickle to {saved_path}"
+    else:
+        pickle_note = f"\nExisting working graph pickle preserved at {project_paths.graph_pickle_file}"
+
+    status = (
+        f"Ingested project at {documents_root}\n"
+        f"Processed files: {summary['processed_files']}\n"
+        f"Skipped files: {summary['skipped_files']}\n"
+        f"Removed files: {summary['removed_files']}\n"
+        f"Chunks: {summary['chunk_count']}\n"
+        f"Entities: {summary['entity_count']}\n"
+        f"Relations: {summary['relation_count']}"
+        f"\nRetrieval snapshot: {project_paths.retrieval_graph_pickle_file}"
+        f"{pickle_note}\n\n"
+        f"{chr(10).join(progress_messages[-20:])}"
+    )
+    yield _ingestion_payload(mygraph, status, str(documents_root))
 
-def toggle_checkboxgroup(checkbox_value):
-    # Show the checkbox group if the checkbox is checked, hide it otherwise
-    return gr.update(visible=checkbox_value)
 
+def save_current_graph(folder_path: str) -> str:
+    if not folder_path:
+        return "Select and ingest a document folder first."
+    saved_path = _save_graph_pickle(folder_path, mygraph)
+    if saved_path is None:
+        return "Unable to determine a project path for the current graph."
+    return f"Saved working graph pickle to {saved_path}"
 
-# =====================================================
-# -------- GRADIO INTERFACE ---------------------------
-# =====================================================
-chathistory = []
-screenwidth = GetSystemMetrics(0)
-screenheight = GetSystemMetrics(1)
-
-my_theme=themes.Soft(primary_hue="blue",
-                     secondary_hue="gray",
-                     font=[themes.GoogleFont("Oxanium"), "Arial", "sans-serif"],
-                     spacing_size=themes.sizes.spacing_sm,
-                     text_size=themes.sizes.text_sm) 
-
-with gr.Blocks(theme="glass", css=".prompt {color: green}") as demo:
-# with gr.Blocks(theme=my_theme, css=".prompt {color: green}") as demo:
-# with gr.Blocks(theme=my_theme) as demo:
+
+def load_saved_graph(folder_path: str) -> Tuple[str, str, str, Any, Any, Any]:  # Replace the global graph with the saved one for `folder_path`; returns (graph HTML, legend HTML, status, *update_dropdowns()).
+    global mygraph
+
+    if not folder_path:  # no active project: re-render the current graph with a hint
+        updates = update_dropdowns()
+        return render_graph_iframe(mygraph), generate_legend_html(mygraph), "Select and ingest a document folder first.", *updates
+
+    project_paths = resolve_project_paths(folder_path)
+    if not project_paths.graph_pickle_file.exists():  # nothing has been saved for this project yet
+        updates = update_dropdowns()
+        return (
+            render_graph_iframe(mygraph),
+            generate_legend_html(mygraph),
+            f"No saved graph pickle found yet at {project_paths.graph_pickle_file}",
+            *updates,
+        )
+
+    try:  # NOTE(review): presumably unpickles a file; only load pickles this app wrote, since unpickling untrusted data can execute code
+        mygraph = _load_graph_from_pickle(folder_path)
+        message = f"Loaded working graph pickle from {project_paths.graph_pickle_file}"
+    except Exception as exc:  # on failure the assignment never happened, so the current graph is kept
+        updates = update_dropdowns()
+        return render_graph_iframe(mygraph), generate_legend_html(mygraph), f"Failed to load saved graph pickle: {exc}", *updates
+
+    updates = update_dropdowns()  # recomputed after the swap, so presumably reflects the loaded graph
+    return render_graph_iframe(mygraph), generate_legend_html(mygraph), message, *updates
+
+
+def merge_nodes(  # Merge two nodes of the global graph into one new node, auto-save, and return (graph HTML, legend HTML, status, *update_dropdowns()).
+    node1: str,
+    node2: str,
+    active_folder: str,
+) -> Tuple[str, str, str, Any, Any, Any]:
+    global mygraph
+
+    if not node1 or not node2:  # guard: both dropdowns must have a selection
+        updates = update_dropdowns()
+        return render_graph_iframe(mygraph), generate_legend_html(mygraph), "Select two nodes to merge.", *updates
+    if node1 not in mygraph or node2 not in mygraph:  # guard: stale selection (node no longer in the graph)
+        updates = update_dropdowns()
+        return render_graph_iframe(mygraph), generate_legend_html(mygraph), "Both nodes must exist in the current graph.", *updates
+    if node1 == node2:
+        updates = update_dropdowns()
+        return render_graph_iframe(mygraph), generate_legend_html(mygraph), "Cannot merge the same node into itself.", *updates
+
+    new_node = f"{node1}_{node2}"  # merged id; a numeric suffix (starting at 2) is appended until the id is unused
+    suffix = 1
+    while new_node in mygraph:
+        suffix += 1
+        new_node = f"{node1}_{node2}_{suffix}"
+
+    label1 = mygraph.nodes[node1].get("label", node1)  # fall back to the node id when no label is stored
+    label2 = mygraph.nodes[node2].get("label", node2)
+    mygraph.add_node(
+        new_node,
+        label=f"{label1} + {label2}",
+        type="Merged",
+        description=f"Merged from {node1} and {node2}",
+        source="Merged",
+    )
+
+    for original in (node1, node2):  # re-attach each original's edges to the merged node, then drop the original
+        for neighbor, attrs in list(mygraph[original].items()):  # NOTE(review): on a directed graph this would likely see only outgoing edges - confirm graph type
+            if neighbor != new_node and neighbor not in (node1, node2):  # the edge between the two originals is not carried over
+                mygraph.add_edge(new_node, neighbor, **attrs)  # NOTE(review): for a neighbor shared by both originals, the second call presumably overwrites attrs - confirm
+        mygraph.remove_node(original)
+
+    autosave = ""  # stays empty when _save_graph_pickle returns None
+    saved_path = _save_graph_pickle(active_folder, mygraph)
+    if saved_path is not None:
+        autosave = f" Auto-saved to {saved_path}."
+
+    updates = update_dropdowns()
+    return render_graph_iframe(mygraph), generate_legend_html(mygraph), f"Merged '{node1}' and '{node2}' into '{new_node}'.{autosave}", *updates
+
+
+def update_node_attributes(  # Overwrite selected attributes of one node in the global graph, auto-save, and return (graph HTML, legend HTML, status, *update_dropdowns()).
+    node_id: str,
+    new_label: str,
+    new_type: str,
+    new_desc: str,
+    new_source: str,
+    active_folder: str,
+) -> Tuple[str, str, str, Any, Any, Any]:
+    global mygraph
+
+    if not node_id:  # guard: nothing selected in the dropdown
+        updates = update_dropdowns()
+        return render_graph_iframe(mygraph), generate_legend_html(mygraph), "Select a node to update.", *updates
+    if node_id not in mygraph:  # guard: stale selection
+        updates = update_dropdowns()
+        return render_graph_iframe(mygraph), generate_legend_html(mygraph), f"Node '{node_id}' was not found in the current graph.", *updates
+
+    if new_label:  # empty/falsy inputs leave the stored attribute untouched, so a field cannot be cleared here
+        mygraph.nodes[node_id]["label"] = new_label
+    if new_type:
+        mygraph.nodes[node_id]["type"] = new_type
+    if new_desc:
+        mygraph.nodes[node_id]["description"] = new_desc
+    if new_source:
+        mygraph.nodes[node_id]["source"] = new_source
+
+    autosave = ""  # stays empty when _save_graph_pickle returns None
+    saved_path = _save_graph_pickle(active_folder, mygraph)
+    if saved_path is not None:
+        autosave = f" Auto-saved to {saved_path}."
+
+    updates = update_dropdowns()
+    return render_graph_iframe(mygraph), generate_legend_html(mygraph), f"Updated node '{node_id}'.{autosave}", *updates
+
+
+async def create_pathrag_response(  # Answer `question` with the PathRAG retriever; returns ("" to clear the input box, updated history, sources text).
+    question: str,
+    chat_history: List[dict],
+    active_folder: str,
+) -> Tuple[str, List[dict], str]:
+    if not active_folder:  # no ingested project: only an assistant hint is appended, the question itself is not recorded
+        chat_history = list(chat_history or [])
+        chat_history.append({"role": "assistant", "content": "Select and ingest a document folder first."})
+        return "", chat_history, ""
+
+    history = list(chat_history or [])  # copy so the caller's list is not mutated; None becomes []
+    history.append({"role": "user", "content": question})
+    try:
+        rag = _get_pathrag(active_folder)
+        result = await rag.aretrieve(question, conversation_history=_history_to_turns(history[:-1]))  # history[:-1] excludes the question just appended
+        history.append({"role": "assistant", "content": result.answer})
+
+        sources = []  # three lines per chunk: header, chunk text, separator
+        for index, chunk in enumerate(result.chunk_matches, start=1):
+            head = chunk.filename or chunk.document_id or "(unknown doc)"  # first non-empty identifier wins
+            sources.append(f"{index}. {head} (score={chunk.score:.3f})")
+            sources.append(chunk.text)
+            sources.append("-" * 46)
+
+        return "", history, "\n".join(sources)
+    except Exception as exc:  # errors are shown in both the chat and the sources box instead of being raised
+        history.append({"role": "assistant", "content": f"PathRAG error: {exc}"})
+        return "", history, f"PathRAG error: {exc}"
+
+
+async def create_lightrag_response(  # Answer `question` with the LightRAG retriever; returns ("" to clear the input box, updated history, sources text).
+    question: str,
+    chat_history: List[dict],
+    active_folder: str,
+) -> Tuple[str, List[dict], str]:
+    if not active_folder:  # no ingested project: only an assistant hint is appended, the question itself is not recorded
+        chat_history = list(chat_history or [])
+        chat_history.append({"role": "assistant", "content": "Select and ingest a document folder first."})
+        return "", chat_history, ""
+
+    history = list(chat_history or [])  # copy so the caller's list is not mutated; None becomes []
+    history.append({"role": "user", "content": question})
+    try:
+        rag = _get_lightrag(active_folder)
+        result = await rag.aretrieve(question, conversation_history=_history_to_turns(history[:-1]))  # history[:-1] excludes the question just appended
+        history.append({"role": "assistant", "content": result.answer})
+
+        sources = []  # three lines per chunk: header, chunk text, separator
+        for index, chunk in enumerate(result.all_chunks, start=1):  # chunks are read with .get(), i.e. treated as dicts here
+            source_type = chunk.get("source_type", "unknown")
+            line = f"{index}. {source_type}"
+            if source_type == "vector" and chunk.get("score") is not None:  # a score is printed only for "vector" chunks that carry one
+                line += f" (score={float(chunk['score']):.3f})"
+            sources.append(line)
+            sources.append(chunk.get("text", ""))
+            sources.append("-" * 46)
+
+        return "", history, "\n".join(sources)
+    except Exception as exc:  # errors are shown in both the chat and the sources box instead of being raised
+        history.append({"role": "assistant", "content": f"LightRAG error: {exc}"})
+        return "", history, f"LightRAG error: {exc}"
+
+
+with gr.Blocks() as demo:
     gr.Markdown("## Interactive Hybrid RAG")
 
-    #sidebar
+    active_folder = gr.State("")
+
     with gr.Sidebar():
-        # Folder and file(s) selection
-        folder_path_input = gr.Textbox(
-            label="Enter document folder (complete path)",
-            interactive=True,
-        )
-        # button to start ingestion process
-        go_btn = gr.Button(value="Load Knowledge Graph", variant="primary")
-
-        # Output status messages
-        status_messages = gr.Textbox(
-            label="Status",
-            interactive=False,
-            lines=8
-        )
+        folder_path_input = gr.Textbox(label="Document folder", placeholder="C:\\path\\to\\documents")
+        go_btn = gr.Button(value="Ingest Folder", variant="primary")
+        status_messages = gr.Textbox(label="Status", interactive=False, lines=16)
 
     with gr.Tabs():
         with gr.Tab("Chat"):
             with gr.Tabs():
-                with gr.Tab("PathRag"):
+                with gr.Tab("PathRAG"):
+                    pathrag_chatbot = gr.Chatbot(type="messages", label="PathRAG Chat History", height=420)
+                    pathrag_sources = gr.Textbox(label="PathRAG sources", interactive=False, lines=14)
                     with gr.Row():
-                        # pathrag chatbot component
-                        pathrag_chatbot = gr.Chatbot(
-                            type="messages",
-                            label="PathRag Chat History",
-                            show_copy_button = True,
-                            avatar_images=('./images/user.png','./images/bot.png'),
-                            height=int(screenheight*0.4),
-                            layout='bubble',
-                        )
-                    with gr.Row():
-                        # pathrag chunk sources
-                        pathrag_sources = gr.Textbox(
-                            label="PathRag sources",
-                            interactive=False,
-                            lines=15,
-                            max_lines=15,
-                            show_copy_button=True
-                        )
-                    with gr.Row():
-                        with gr.Column(scale=9):
-                            # prompt textbox
-                            pathrag_msg_input = gr.Textbox(
-                                elem_id="prompt",
-                                label="Your Question",
-                                show_label=False,
-                                placeholder="Type your question here and hit Enter...",
-                            )
-                        with gr.Column(scale=1):
-                            # clear conversation button
-                            pathrag_clear_btn = gr.ClearButton(components=[pathrag_msg_input, pathrag_chatbot, pathrag_sources],
-                                                    value="Clear conversation",
-                                                    variant="primary",
-                                                    scale=1)
-                with gr.Tab("LightRag"):
-                    with gr.Row():
-                        # lightrag chatbot component
-                        lightrag_chatbot = gr.Chatbot(
-                            type="messages",
-                            label="LightRag Chat History",
-                            show_copy_button = True,
-                            avatar_images=('./images/user.png','./images/bot.png'),
-                            height=int(screenheight*0.4),
-                            layout='bubble'
+                        pathrag_msg_input = gr.Textbox(show_label=False, placeholder="Ask a question about the active project...")
+                        pathrag_clear_btn = gr.ClearButton(
+                            components=[pathrag_msg_input, pathrag_chatbot, pathrag_sources],
+                            value="Clear conversation",
                         )
+
+                with gr.Tab("LightRAG"):
+                    lightrag_chatbot = gr.Chatbot(type="messages", label="LightRAG Chat History", height=420)
+                    lightrag_sources = gr.Textbox(label="LightRAG sources", interactive=False, lines=14)
                     with gr.Row():
-                        # lightrag chunk sources
-                        lightrag_sources = gr.Textbox(
-                            label="LightRag sources",
-                            interactive=False,
-                            lines=15,
-                            max_lines=15,
-                            show_copy_button=True
+                        lightrag_msg_input = gr.Textbox(show_label=False, placeholder="Ask a question about the active project...")
+                        lightrag_clear_btn = gr.ClearButton(
+                            components=[lightrag_msg_input, lightrag_chatbot, lightrag_sources],
+                            value="Clear conversation",
                         )
-                    with gr.Row():
-                        with gr.Column(scale=9):
-                            # prompt textbox
-                            lightrag_msg_input = gr.Textbox(
-                                elem_id="prompt",
-                                label="Your Question",
-                                show_label=False,
-                                placeholder="Type your question here and hit Enter...",
-                            )
-                        with gr.Column(scale=1):
-                            # clear conversation button
-                            lightrag_clear_btn = gr.ClearButton(components=[lightrag_msg_input, lightrag_chatbot, lightrag_sources],
-                                                    value="Clear conversation",
-                                                    variant="primary",
-                                                    scale=1)
-
-            # state = gr.State([])
-
 
         with gr.Tab("Knowledge Graph"):
             with gr.Row():
                 with gr.Column(scale=9):
                     graph_html = gr.HTML(render_graph_iframe(mygraph))
                 with gr.Column(scale=1):
-                    with gr.Row():
-                        # show legend
-                        legend_html = gr.HTML(generate_legend_html(mygraph))
-
+                    legend_html = gr.HTML(generate_legend_html(mygraph))
+            with gr.Row():
+                save_pickle_btn = gr.Button(value="Save Working Graph", variant="primary")
+                load_pickle_btn = gr.Button(value="Load Saved Graph")
             with gr.Tabs():
                 with gr.Tab("Edit Node"):
-                    with gr.Row():
-                        edit_node_dropdown = gr.Dropdown(choices=[], label="Select Node")
-                        edit_label = gr.Textbox(label="Label")
-                        edit_type = gr.Textbox(label="Type")
-                        edit_desc = gr.Textbox(label="Description")
-                        edit_source = gr.Textbox(label="Source")
-                        updatenode_btn = gr.Button(value="Update Node", variant="primary")
+                    edit_node_dropdown = gr.Dropdown(choices=[], label="Select Node")
+                    edit_label = gr.Textbox(label="Label")
+                    edit_type = gr.Textbox(label="Type")
+                    edit_desc = gr.Textbox(label="Description")
+                    edit_source = gr.Textbox(label="Source")
+                    updatenode_btn = gr.Button(value="Update Node", variant="primary")
                 with gr.Tab("Merge Nodes"):
-                    with gr.Row():
-                        m1 = gr.Dropdown(choices=[], label="Node 1")
-                        m2 = gr.Dropdown(choices=[], label="Node 2")
-                        mergenodes_btn = gr.Button(value="Merge Nodes", variant="primary")
-                with gr.Tab("Save Graph"):
-                    save_pickle_btn = gr.Button(value="💾 Save Pickle", variant="primary")
-
-    # --- Bindings ---
-    # sidebar components
-    # Go button click triggers ingestion process
-    go_btn.click(fn=handle_ingestion,
-                 inputs=[folder_path_input],
-                 outputs=[graph_html, status_messages] 
-                 ).then(fn=update_dropdowns,
-                        outputs=[m1, m2, edit_node_dropdown]
-                        )
-                        # then(fn=load_graph_from_pkl, inputs=[folder_path_input], outputs=[graph_html]). \
-    # submission of prompt triggers respons process
-    pathrag_msg_input.submit(fn=create_pathrag_response,
-                             inputs=[pathrag_msg_input, pathrag_chatbot],
-                             outputs=[pathrag_msg_input, pathrag_chatbot, pathrag_sources])
-    lightrag_msg_input.submit(fn=create_lightrag_response,
-                              inputs=[lightrag_msg_input, lightrag_chatbot],
-                              outputs=[lightrag_msg_input, lightrag_chatbot, lightrag_sources])
-    
-    # clear prompt and chat history
-    pathrag_clear_btn.click(fn=lambda: [None, None, None],
-                            inputs=[],
-                            outputs=[pathrag_msg_input, pathrag_chatbot, pathrag_sources],
-                            queue=False)
-    
-    lightrag_clear_btn.click(fn=lambda: [None, None, None],
-                             inputs=[],
-                             outputs=[lightrag_msg_input, lightrag_chatbot, lightrag_sources],
-                             queue=False)
-    
-    # update contents of node
-    updatenode_btn.click(fn=update_node_attributes,
-                         inputs=[edit_node_dropdown, edit_label, edit_type, edit_desc, edit_source],
-                         outputs=[graph_html, legend_html, status_messages])
-    # merge nodes
-    mergenodes_btn.click(fn=merge_nodes,
-                         inputs=[m1, m2],
-                         outputs=[graph_html, legend_html, status_messages])
-    # save graph
-    save_pickle_btn.click(fn=lambda: save_graph_pickle(),
-                          outputs=status_messages)
-
-    demo.load(fn=update_dropdowns,
-              outputs=[m1, m2, edit_node_dropdown])
-
-demo.launch(inbrowser=True, pwa=True)
+                    m1 = gr.Dropdown(choices=[], label="Node 1")
+                    m2 = gr.Dropdown(choices=[], label="Node 2")
+                    mergenodes_btn = gr.Button(value="Merge Nodes", variant="primary")
+
+    go_btn.click(  # ingest the folder typed in the sidebar
+        fn=handle_ingestion,
+        inputs=[folder_path_input],
+        outputs=[graph_html, legend_html, status_messages, active_folder, m1, m2, edit_node_dropdown],  # also writes the active_folder state and the three node dropdowns
+    )
+
+    save_pickle_btn.click(  # save the working graph for the active project
+        fn=save_current_graph,
+        inputs=[active_folder],  # reads the stored state, not the folder textbox
+        outputs=[status_messages],
+    )
+
+    load_pickle_btn.click(  # load the saved graph for the active project
+        fn=load_saved_graph,
+        inputs=[active_folder],
+        outputs=[graph_html, legend_html, status_messages, m1, m2, edit_node_dropdown],
+    )
+
+    updatenode_btn.click(  # apply the "Edit Node" form to the selected node
+        fn=update_node_attributes,
+        inputs=[edit_node_dropdown, edit_label, edit_type, edit_desc, edit_source, active_folder],  # order matches the function's parameters
+        outputs=[graph_html, legend_html, status_messages, m1, m2, edit_node_dropdown],
+    )
+
+    mergenodes_btn.click(  # merge the two nodes chosen in the "Merge Nodes" tab
+        fn=merge_nodes,
+        inputs=[m1, m2, active_folder],
+        outputs=[graph_html, legend_html, status_messages, m1, m2, edit_node_dropdown],
+    )
+
+    pathrag_msg_input.submit(  # Enter in the PathRAG prompt box
+        fn=create_pathrag_response,
+        inputs=[pathrag_msg_input, pathrag_chatbot, active_folder],
+        outputs=[pathrag_msg_input, pathrag_chatbot, pathrag_sources],  # the handler returns "" first, which clears the prompt box
+    )
+    lightrag_msg_input.submit(  # Enter in the LightRAG prompt box
+        fn=create_lightrag_response,
+        inputs=[lightrag_msg_input, lightrag_chatbot, active_folder],
+        outputs=[lightrag_msg_input, lightrag_chatbot, lightrag_sources],
+    )
+
+    pathrag_clear_btn.click(  # reset prompt, chat and sources of the PathRAG tab to None
+        fn=lambda: [None, None, None],
+        inputs=[],
+        outputs=[pathrag_msg_input, pathrag_chatbot, pathrag_sources],
+        queue=False,
+    )
+    lightrag_clear_btn.click(  # reset prompt, chat and sources of the LightRAG tab to None
+        fn=lambda: [None, None, None],
+        inputs=[],
+        outputs=[lightrag_msg_input, lightrag_chatbot, lightrag_sources],
+        queue=False,
+    )
+
+    demo.load(  # fill the three node dropdowns when the page loads
+        fn=update_dropdowns,
+        outputs=[m1, m2, edit_node_dropdown],
+    )
+
+
+if __name__ == "__main__":  # start the UI only when run as a script, not on import
+    demo.queue()
+    demo.launch(inbrowser=True, pwa=True)
diff --git a/graph/chunker.py b/graph/chunker.py
index cfbde1e..ca10aa7 100644
--- a/graph/chunker.py
+++ b/graph/chunker.py
@@ -184,14 +184,14 @@ def chunk_parsed_pages(
         return chunks
 
     i = 0  # index of the next *new* sentence to place
-    prev_chunk_sentence_indices: List[int] = []
+    prev_new_sentence_indices: List[int] = []
 
     while i < len(sentences):
         # Determine overlap sentences (from tail of previous chunk) within overlap_chars
         overlap: List[Sentence] = []
-        if prev_chunk_sentence_indices and overlap_chars > 0:
+        if prev_new_sentence_indices and overlap_chars > 0:
             # Walk backwards over previous chunk's sentence indices, collect until we hit the char budget
-            tail = [sentences[k] for k in prev_chunk_sentence_indices]
+            tail = [sentences[k] for k in prev_new_sentence_indices]
             total = 0
             tmp: List[Sentence] = []
             for s in reversed(tail):
@@ -256,11 +256,12 @@ def chunk_parsed_pages(
             "included_new_sentence_count": len(new_sents),
             "include_overlap_in_limit": include_overlap_in_limit,
             "max_chars_target": max_chars,
+            "exceeds_target": len(text) > max_chars,
         }
         chunks.append(chunk)
 
         # Prepare for next iteration
-        prev_chunk_sentence_indices = chunk_sentence_indices
+        prev_new_sentence_indices = [s.idx for s in new_sents]
 
     return chunks
 
@@ -302,4 +303,4 @@ def chunk_text(
         print(f"[{ch['chunk_id']}] p{ch['start_page']}–p{ch['end_page']} ({ch['char_count']} chars)"
               f" | overlap={ch['overlap_chars_effective']}")
         print(ch["text"])
-        print("---")
\ No newline at end of file
+        print("---")
diff --git a/graph/db_storage.py b/graph/db_storage.py
index 637f0c3..3018dbf 100644
--- a/graph/db_storage.py
+++ b/graph/db_storage.py
@@ -1,9 +1,12 @@
 
 from __future__ import annotations
 import os
+import sys
 import sqlite3
 import json
 import hashlib
+import math
+import logging
 from typing import Dict, Any, List, Optional, Sequence, Tuple
 from contextlib import contextmanager
 from dataclasses import replace
@@ -11,6 +14,8 @@
 from llm import Embedder
 from settings import settings, StoragePaths as SettingsStoragePaths
 
+LOGGER = logging.getLogger("appl_kgraph.storage")
+
 # ---------------------------
 # Helpers
 # ---------------------------
@@ -355,6 +360,13 @@ def get_chunks_by_uuids(self, chunk_uuids: List[str]) -> List[Dict[str, Any]]:
             cur.execute(f"SELECT * FROM chunks WHERE chunk_uuid IN ({placeholders});", chunk_uuids)
             rows = cur.fetchall()
         return [dict(zip(self.KEYS, row)) for row in rows]
+
+    def list_chunks(self) -> List[Dict[str, Any]]:  # Return every row of the chunks table as a dict, ordered by filename then chunk_id.
+        with self.connect() as con:
+            cur = con.cursor()
+            cur.execute("SELECT * FROM chunks ORDER BY filename, chunk_id;")
+            rows = cur.fetchall()
+        return [dict(zip(self.KEYS, row)) for row in rows]  # assumes self.KEYS follows the table's column order - TODO confirm
     
     def delete_chunks_by_doc_id(self, doc_id: str) -> None:
         with self.connect() as con:
@@ -495,6 +507,18 @@ def get_nodes(self, names: List[str]) -> List[Dict[str, Any]]:
             ''', tuple(names))
             rows = cur.fetchall()
             return [dict(zip(keys, row)) for row in rows] if rows else []
+
+    def list_nodes(self) -> List[Dict[str, Any]]:  # Return all rows of the nodes table (without the id column) as dicts, ordered by name.
+        keys = [k for k in self.KEYS_NODE if k != "id"]  # assumes the remaining KEYS_NODE order matches the SELECT column list below - TODO confirm
+        with self.connect() as con:
+            cur = con.cursor()
+            cur.execute('''
+                SELECT name, type, description, source_id, filepath
+                FROM nodes
+                ORDER BY name;
+            ''')
+            rows = cur.fetchall()
+            return [dict(zip(keys, row)) for row in rows] if rows else []  # empty table yields []
     
 
     def get_nodes_by_chunk_uuids(self, chunk_uuids: List[str]) -> List[Dict[str, Any]]:
@@ -621,6 +645,18 @@ def get_edges(self, pairs: List[Tuple[str, str]]) -> List[Dict[str, Any]]:
             ''', tuple(flat_params))
             rows = cur.fetchall()
             return [dict(zip(keys, row)) for row in rows] if rows else []
+
+    def list_edges(self) -> List[Dict[str, Any]]:  # Return all rows of the edges table as dicts, ordered by the u_source_name/u_target_name columns.
+        keys = [k for k in self.KEYS_EDGE if k not in {"id", "u_source_name", "u_target_name"}]  # assumes the remaining KEYS_EDGE order matches the SELECT column list below - TODO confirm
+        with self.connect() as con:
+            cur = con.cursor()
+            cur.execute('''
+                SELECT source_name, target_name, weight, description, keywords, source_id, filepath
+                FROM edges
+                ORDER BY u_source_name, u_target_name;
+            ''')
+            rows = cur.fetchall()
+            return [dict(zip(keys, row)) for row in rows] if rows else []  # empty table yields []
         
     def get_edges_by_chunk_uuids(self, chunk_uuids: List[str]) -> List[Dict[str, Any]]:
         if not chunk_uuids:
@@ -932,6 +968,10 @@ def _edge_id(a: str, b: str) -> str:
         x, y = _normalize_pair(a, b)
         return f"{x}::{y}"
 
+    @staticmethod
+    def _embed_text(source_name: str, target_name: str, description: str, keywords: str) -> str:  # Build the text that gets embedded for a relation; shared by add_relations and upsert_relations.
+        return f"{source_name} <-> {target_name} :: {description} :: {keywords}"
+
     def add_relations(self, relations: Sequence[Dict[str, Any]]) -> None:
         ids: List[str] = []
         texts: List[str] = []
@@ -943,7 +983,7 @@ def add_relations(self, relations: Sequence[Dict[str, Any]]) -> None:
             kw = r.get("keywords", "") or ""
             ids.append(self._edge_id(src, tgt))
             # Embed src + tgt + description + keywords
-            texts.append(f"{src} <-> {tgt} :: {desc} :: {kw}")
+            texts.append(self._embed_text(src, tgt, desc, kw))
             metas.append({
                 "source_name": src,
                 "target_name": tgt,
@@ -974,7 +1014,7 @@ def upsert_relations(self, relations: Sequence[Dict[str, Any]]) -> None:
             kw = r.get("keywords", "") or ""
             ids.append(self._edge_id(src, tgt))
             # Embed src + tgt + description + keywords
-            texts.append(f"{src} <-> {tgt} :: {desc} :: {kw}")
+            texts.append(self._embed_text(src, tgt, desc, kw))
             metas.append({
                 "source_name": src,
                 "target_name": tgt,
@@ -1027,6 +1067,12 @@ def __init__(self, paths: Optional[StoragePaths] = None, embedder: Optional[Embe
         self.chunk_vectors = ChunkVectors(collection="chunks", chroma_dir=paths.chroma_chunks, embedder=self.embedder)
         self.entity_vectors = EntityVectors(collection="entities", chroma_dir=paths.chroma_entities, embedder=self.embedder)
         self.relation_vectors = RelationVectors(collection="relations", chroma_dir=paths.chroma_relations, embedder=self.embedder)
+        self._chunk_search_cache: Optional[List[Tuple[Dict[str, Any], List[float]]]] = None
+        self._entity_search_cache: Optional[List[Tuple[Dict[str, Any], List[float]]]] = None
+        self._relation_search_cache: Optional[List[Tuple[Dict[str, Any], List[float]]]] = None
+        self._prefer_python_vector_search = self._should_use_python_vector_search()
+        self._vector_mutations_disabled = self._prefer_python_vector_search
+        self._vector_warning_emitted = False
 
     def init(self):
         """Create tables/collections if they don't exist yet."""
@@ -1034,6 +1080,206 @@ def init(self):
         self.chunksdb.init()
         self.graphdb.init()
 
+    def _invalidate_chunk_cache(self) -> None:  # Drop the cached chunk search data by resetting it to None.
+        self._chunk_search_cache = None
+
+    def _invalidate_entity_cache(self) -> None:  # Drop the cached entity search data by resetting it to None.
+        self._entity_search_cache = None
+
+    def _invalidate_relation_cache(self) -> None:  # Drop the cached relation search data by resetting it to None.
+        self._relation_search_cache = None
+
+    @staticmethod
+    def _cosine_similarity(a: Sequence[float], b: Sequence[float]) -> float:  # Cosine similarity of two vectors in pure Python, clamped to be non-negative.
+        if not a or not b or len(a) != len(b):  # empty or length-mismatched vectors score 0.0
+            return 0.0
+        dot = 0.0
+        norm_a = 0.0  # accumulates the squared norm of a
+        norm_b = 0.0  # accumulates the squared norm of b
+        for x, y in zip(a, b):
+            dot += float(x) * float(y)
+            norm_a += float(x) * float(x)
+            norm_b += float(y) * float(y)
+        if norm_a <= 0.0 or norm_b <= 0.0:  # zero vector: avoid dividing by zero
+            return 0.0
+        return max(0.0, dot / (math.sqrt(norm_a) * math.sqrt(norm_b)))  # negative cosine values are reported as 0.0
+
+    @staticmethod
+    def _env_flag(name: str) -> Optional[bool]:  # Read environment variable `name` as a tri-state flag: True, False, or None.
+        raw = os.getenv(name, "").strip().lower()  # comparison is case-insensitive and ignores surrounding whitespace
+        if not raw:  # unset or blank
+            return None
+        if raw in {"1", "true", "yes", "on"}:
+            return True
+        if raw in {"0", "false", "no", "off"}:
+            return False
+        return None  # unrecognized values are treated like unset
+
+    def _should_use_python_vector_search(self) -> bool:  # Decide whether to use the pure-Python similarity search instead of native Chroma queries.
+        force_python = self._env_flag("APPL_KGRAPH_FORCE_PYTHON_VECTOR_SEARCH")  # highest precedence
+        if force_python is not None:
+            return force_python
+        force_chroma = self._env_flag("APPL_KGRAPH_FORCE_CHROMA_QUERY")  # consulted only when the first flag is unset or unrecognized
+        if force_chroma is not None:
+            return not force_chroma
+        return os.name == "nt" and sys.version_info >= (3, 12)  # default: Python search only on Windows with Python 3.12+
+
+    def _warn_vector_backend_disabled(self) -> None:  # Log the fallback warning at most once per instance, and only when mutations are disabled.
+        if self._vector_mutations_disabled and not self._vector_warning_emitted:
+            LOGGER.warning(
+                "Native Chroma vector mutations are disabled on this environment; "
+                "falling back to SQL-backed similarity search."
+            )
+            self._vector_warning_emitted = True  # suppresses repeat warnings
+
+    @staticmethod
+    def _flatten_query_values(values: Any) -> List[Any]:  # Normalize a query result field to a flat list.
+        if values is None:
+            return []
+        if isinstance(values, list) and values and isinstance(values[0], list):  # nested list: only the first inner list is kept
+            return list(values[0])
+        if isinstance(values, list):  # already flat (or empty): return a shallow copy
+            return list(values)
+        return [values]  # any other value is wrapped in a one-element list
+
+    @staticmethod
+    def _distance_to_similarity(value: Any) -> float:  # Map a distance to a similarity as 1 - distance, floored at 0.0.
+        try:
+            return max(0.0, 1.0 - float(value))  # NOTE(review): presumably expects a cosine-style distance; other metrics would mostly clamp to 0.0 - confirm collection metric
+        except (TypeError, ValueError):  # None or non-numeric input scores 0.0
+            return 0.0
+
+    def _native_chunk_search(self, text: str, n_results: int) -> List[Dict[str, Any]]:  # Query self.chunk_vectors and reshape its rows into flat chunk match dicts with a "score".
+        rows = self.chunk_vectors.query(text=text, n_results=n_results) or []  # a falsy result is treated as no rows
+        matches: List[Dict[str, Any]] = []
+        for row in rows:
+            ids = self._flatten_query_values(row.get("ids"))
+            documents = self._flatten_query_values(row.get("documents"))
+            metadatas = self._flatten_query_values(row.get("metadatas"))
+            distances = self._flatten_query_values(row.get("distances"))
+            size = max(len(ids), len(documents), len(metadatas), len(distances))  # iterate to the longest field; shorter ones fall back to defaults below
+            for index in range(size):
+                metadata = metadatas[index] if index < len(metadatas) and isinstance(metadatas[index], dict) else {}  # missing or non-dict metadata becomes {}
+                matches.append({
+                    "chunk_uuid": ids[index] if index < len(ids) else "",
+                    "doc_id": metadata.get("doc_id", ""),
+                    "filename": metadata.get("filename", ""),
+                    "text": documents[index] if index < len(documents) else "",
+                    "score": self._distance_to_similarity(distances[index] if index < len(distances) else None),  # a missing distance scores 0.0
+                })
+        return matches[:n_results]  # cap the combined matches from all rows
+
+    def _native_entity_search(self, text: str, n_results: int) -> List[Dict[str, Any]]:  # Query self.entity_vectors and build entity match dicts from the returned metadata.
+        rows = self.entity_vectors.query(text=text, n_results=n_results) or []  # a falsy result is treated as no rows
+        matches: List[Dict[str, Any]] = []
+        for row in rows:
+            metadatas = self._flatten_query_values(row.get("metadatas"))
+            distances = self._flatten_query_values(row.get("distances"))
+            for index, metadata in enumerate(metadatas):
+                if not isinstance(metadata, dict):  # entries without dict metadata are skipped entirely
+                    continue
+                matches.append({
+                    "name": metadata.get("name", ""),
+                    "type": metadata.get("type", ""),
+                    "description": metadata.get("description", ""),
+                    "source_id": metadata.get("source_id", ""),
+                    "filepath": metadata.get("filepath", ""),
+                    "score": self._distance_to_similarity(distances[index] if index < len(distances) else None),  # a missing distance scores 0.0
+                })
+        return matches[:n_results]  # cap the combined matches from all rows
+
+    def _native_relation_search(self, text: str, n_results: int) -> List[Dict[str, Any]]:  # Query relation_vectors for `text` and reshape the hits into relation dicts that carry a "score".
+        rows = self.relation_vectors.query(text=text, n_results=n_results) or []  # a falsy query result is treated as "no rows"
+        matches: List[Dict[str, Any]] = []
+        for row in rows:
+            metadatas = self._flatten_query_values(row.get("metadatas"))  # presumably un-nests the backend's result lists -- TODO confirm against _flatten_query_values
+            distances = self._flatten_query_values(row.get("distances"))
+            for index, metadata in enumerate(metadatas):
+                if not isinstance(metadata, dict):  # a hit without dict metadata cannot be mapped to a relation; skip it
+                    continue
+                matches.append({
+                    "source_name": metadata.get("source_name", ""),
+                    "target_name": metadata.get("target_name", ""),
+                    "description": metadata.get("description", ""),
+                    "keywords": metadata.get("keywords", ""),
+                    "weight": metadata.get("weight", 0),  # the only field with a numeric default; the others default to ""
+                    "source_id": metadata.get("source_id", ""),
+                    "filepath": metadata.get("filepath", ""),
+                    "score": self._distance_to_similarity(distances[index] if index < len(distances) else None),  # None is passed when no distance exists at this position
+                })
+        return matches[:n_results]  # hits from all rows are concatenated, so the combined list is sliced to n_results
+
+    def _rank_records(  # Rank cached (record, embedding) pairs against query_text by cosine similarity; returns the top n_results.
+        self,
+        *,
+        query_text: str,
+        cache: List[Tuple[Dict[str, Any], List[float]]],  # (record, embedding) pairs
+        n_results: int,
+    ) -> List[Dict[str, Any]]:
+        if not query_text or not cache or n_results <= 0:  # nothing to rank: return before making any embedding call
+            return []
+        query_embedding = self.embedder.embed_texts([query_text])[0]  # one text in, so take the single resulting vector
+        scored: List[Tuple[float, Dict[str, Any]]] = []
+        for record, embedding in cache:
+            score = self._cosine_similarity(query_embedding, embedding)
+            scored.append((score, record))
+        scored.sort(key=lambda item: item[0], reverse=True)  # highest score first; keyed on the score only, so record dicts are never compared
+        return [{**record, "score": score} for score, record in scored[:n_results]]  # builds new dicts, leaving the cached records unmodified
+
+    def search_chunks(self, text: str, n_results: int = 5) -> List[Dict[str, Any]]:  # Chunk search: native vector query first, in-process embedding ranking as the fallback.
+        if not self._prefer_python_vector_search:
+            try:
+                return self._native_chunk_search(text=text, n_results=n_results)
+            except Exception:  # NOTE(review): any native-search failure is swallowed without logging; the Python path below runs instead
+                pass
+        if self._chunk_search_cache is None:  # cache unset: embed the text of every chunk returned by chunksdb
+            chunks = self.chunksdb.list_chunks()
+            texts = [chunk.get("text", "") or "" for chunk in chunks]  # a missing or None text becomes ""
+            embeddings = self.embedder.embed_texts(texts) if texts else []  # skip the embedder call when there are no chunks
+            self._chunk_search_cache = list(zip(chunks, embeddings))
+        return self._rank_records(query_text=text, cache=self._chunk_search_cache, n_results=n_results)
+
+    def search_entities(self, text: str, n_results: int = 5) -> List[Dict[str, Any]]:  # Entity search: native vector query first, in-process embedding ranking as the fallback.
+        if not self._prefer_python_vector_search:
+            try:
+                return self._native_entity_search(text=text, n_results=n_results)
+            except Exception:  # NOTE(review): any native-search failure is swallowed without logging; the Python path below runs instead
+                pass
+        if self._entity_search_cache is None:  # cache unset: embed every node returned by graphdb
+            entities = self.graphdb.list_nodes()
+            texts = [
+                EntityVectors._embed_text(  # presumably the same text layout the entity vector store embeds -- TODO confirm
+                    entity.get("name", ""),  # NOTE(review): no `or ""` guard here, so a stored None name reaches _embed_text
+                    entity.get("type", "") or "",
+                    entity.get("description", "") or "",
+                )
+                for entity in entities
+            ]
+            embeddings = self.embedder.embed_texts(texts) if texts else []  # skip the embedder call when there are no nodes
+            self._entity_search_cache = list(zip(entities, embeddings))
+        return self._rank_records(query_text=text, cache=self._entity_search_cache, n_results=n_results)
+
+    def search_relations(self, text: str, n_results: int = 5) -> List[Dict[str, Any]]:  # Relation search: native vector query first, in-process embedding ranking as the fallback.
+        if not self._prefer_python_vector_search:
+            try:
+                return self._native_relation_search(text=text, n_results=n_results)
+            except Exception:  # NOTE(review): any native-search failure is swallowed without logging; the Python path below runs instead
+                pass
+        if self._relation_search_cache is None:  # cache unset: embed every edge returned by graphdb
+            relations = self.graphdb.list_edges()
+            texts = [
+                RelationVectors._embed_text(  # presumably the same text layout the relation vector store embeds -- TODO confirm
+                    relation.get("source_name", ""),  # NOTE(review): no `or ""` guard on the two names, so a stored None reaches _embed_text
+                    relation.get("target_name", ""),
+                    relation.get("description", "") or "",
+                    relation.get("keywords", "") or "",
+                )
+                for relation in relations
+            ]
+            embeddings = self.embedder.embed_texts(texts) if texts else []  # skip the embedder call when there are no edges
+            self._relation_search_cache = list(zip(relations, embeddings))
+        return self._rank_records(query_text=text, cache=self._relation_search_cache, n_results=n_results)
+
     def get_llm_cache(self, model: str, prompt_sha: str, text_sha: str, max_age_hours: int) -> Optional[str]:
         return self.documentsdb.get_llm_cache(model, prompt_sha, text_sha, max_age_hours)
 
@@ -1049,17 +1295,23 @@ def add_document(self, metadata: Dict[str, Any], full_text: str) -> None:
     # 2) Chunks schema
     def add_chunks(self, chunks: Sequence[Dict[str, Any]]) -> None:
         self.chunksdb.add_chunks(chunks)
+        self._invalidate_chunk_cache()  # chunk rows changed; presumably resets the cache that search_chunks builds from chunksdb -- TODO confirm
 
     # 3) Knowledge Graph schema
     def add_kg_nodes(self, nodes: List[Dict[str, Any]]) -> None:
         self.graphdb.add_nodes(nodes)
+        self._invalidate_entity_cache()  # graph nodes changed; presumably resets the cache that search_entities builds from graphdb -- TODO confirm
 
     def add_kg_edges(self, edges: List[Dict[str, Any]]) -> None:
         self.graphdb.add_edges(edges)
+        self._invalidate_relation_cache()  # graph edges changed; presumably resets the cache that search_relations builds from graphdb -- TODO confirm
 
     # 4) Chunk vectors
     def add_chunk_vectors(self, chunks: Sequence[Dict[str, Any]]) -> None:
         if chunks:
+            if self._vector_mutations_disabled:  # when vector mutations are disabled: emit the warning and skip the write
+                self._warn_vector_backend_disabled()
+                return
             self.chunk_vectors.add_chunks(chunks)
 
     # 5) Entity vectors
@@ -1069,6 +1321,9 @@ def add_entity_vectors(self, entities: Sequence[Dict[str, Any]]) -> None:
         Uniqueness enforced via entity name.
         """
         if entities:
+            if self._vector_mutations_disabled:
+                self._warn_vector_backend_disabled()
+                return
             self.entity_vectors.add_entities(entities)
 
     # 6) Relation vectors
@@ -1079,6 +1334,9 @@ def add_relation_vectors(self, relations: Sequence[Dict[str, Any]]) -> None:
         Uniqueness enforced via normalized ID "min::max".
         """
         if relations:
+            if self._vector_mutations_disabled:
+                self._warn_vector_backend_disabled()
+                return
             self.relation_vectors.add_relations(relations)
 
     # ---------- Get-only APIs ----------
@@ -1134,6 +1392,8 @@ def get_edges_by_chunk_uuids(self, chunk_uuids: List[str]) -> List[Dict[str, Any
 
     # 4) Chunk Vectors
     def get_chunk_vector(self, chunk_uuid: str) -> Optional[Dict[str, Any]]:
+        if self._vector_mutations_disabled:  # the mutation flag also short-circuits this read; the vector store is not touched
+            return None
         return self.chunk_vectors.get([chunk_uuid])
 
     # 5) Entity Vectors
@@ -1144,11 +1404,15 @@ def get_entities(self, names: List[str]) -> List[Dict[str, Any]]:
         """
         if not names:
             return []
+        if self._vector_mutations_disabled:
+            return []
         return self.entity_vectors.get_entities(names)
 
 
     # 6) Relation Vectors
     def get_relations(self, pairs: List[Tuple[str, str]]) -> Optional[Dict[str, Any]]:
+        if self._vector_mutations_disabled:  # the mutation flag also short-circuits this read; the vector store is not touched
+            return []  # NOTE(review): a list is returned although the signature declares Optional[Dict[str, Any]] -- confirm which shape callers expect
         return self.relation_vectors.get_relations(pairs)
 
     # ---------- Delete-only APIs ----------
@@ -1160,12 +1424,15 @@ def delete_document(self, doc_id: str) -> None:
     # 2) Chunks
     def delete_chunks_by_doc_id(self, doc_id: str) -> None:
         self.chunksdb.delete_chunks_by_doc_id(doc_id)
+        self._invalidate_chunk_cache()  # chunk rows were removed; presumably resets the cache that search_chunks builds from chunksdb -- TODO confirm
 
     def delete_chunk_by_uuid(self, chunk_uuid: str) -> None:
         self.chunksdb.delete_chunk_by_uuid(chunk_uuid)
+        self._invalidate_chunk_cache()  # a chunk row was removed; presumably resets the cache that search_chunks builds from chunksdb -- TODO confirm
 
     def delete_chunks_by_uuids(self, chunk_uuids: List[str]) -> None:
         self.chunksdb.delete_chunks_by_uuids(chunk_uuids)
+        self._invalidate_chunk_cache()  # chunk rows were removed; presumably resets the cache that search_chunks builds from chunksdb -- TODO confirm
 
     # 3) Graph
     def delete_node(self, name: str) -> None:
@@ -1173,21 +1440,27 @@ def delete_node(self, name: str) -> None:
         Pass-through: delete one graph node by name.
         """
         self.graphdb.delete_node(name)
+        self._invalidate_entity_cache()
 
     def delete_nodes(self, names: List[str]) -> None:
         """
         Pass-through: bulk delete graph nodes by name.
         """
         self.graphdb.delete_nodes(names)
+        self._invalidate_entity_cache()  # graph nodes were removed; presumably resets the cache that search_entities builds from graphdb -- TODO confirm
 
     def delete_edge(self, source_name: str, target_name: str) -> None:
         self.graphdb.delete_edge(source_name, target_name)
+        self._invalidate_relation_cache()  # a graph edge was removed; presumably resets the cache that search_relations builds from graphdb -- TODO confirm
 
     def delete_edges(self, pairs: List[Tuple[str, str]]) -> None:
         self.graphdb.delete_edges(pairs)
+        self._invalidate_relation_cache()  # graph edges were removed; presumably resets the cache that search_relations builds from graphdb -- TODO confirm
 
     # 4) Chunk Vectors
     def delete_chunk_vector(self, chunk_uuid: str) -> None:
+        if self._vector_mutations_disabled:  # skipped without a warning, unlike the add_*/upsert_* vector methods which call _warn_vector_backend_disabled()
+            return
         self.chunk_vectors.delete([chunk_uuid])
 
     # 5) Entity Vectors
@@ -1197,10 +1470,14 @@ def delete_entity_vector(self, names: List[str]) -> None:
         """
         if not names:
             return
+        if self._vector_mutations_disabled:
+            return
         self.entity_vectors.delete_entities(names)
 
     # 6) Relation Vectors
     def delete_relation_vector(self, pairs: List[Tuple[str, str]]) -> None:
+        if self._vector_mutations_disabled:  # skipped without a warning, unlike the add_*/upsert_* vector methods which call _warn_vector_backend_disabled()
+            return
         self.relation_vectors.delete_relations(pairs)
 
     # ---------- Update and Upsert APIs ----------
@@ -1212,6 +1489,7 @@ def upsert_document(self, doc_id: str, updates: Dict[str, Any]) -> None:
     # 2) Chunks
     def upsert_chunk(self, chunk_uuid: str, updates: Dict[str, Any]) -> None:
         self.chunksdb.update_chunk(chunk_uuid, updates)
+        self._invalidate_chunk_cache()  # a chunk row changed; presumably resets the cache that search_chunks builds from chunksdb -- TODO confirm
 
     # 3) Graph
     def upsert_node(self, name: str, updates: Dict[str, Any]) -> None:
@@ -1247,6 +1525,7 @@ def upsert_nodes(self, updates_list: List[Dict[str, Any]]) -> None:
             self.graphdb.add_nodes(to_add)
         if to_update:
             self.graphdb.update_nodes(to_update)
+        self._invalidate_entity_cache()
 
     def upsert_edge(self, source_name: str, target_name: str, updates: Dict[str, Any]) -> None:
         self.graphdb.update_edge(source_name, target_name, updates)
@@ -1264,15 +1543,31 @@ def upsert_edges(self, updates_list: List[Dict[str, Any]]) -> None:
             self.graphdb.add_edges(to_add)
         if to_update:
             self.graphdb.update_edges(to_update)
+        self._invalidate_relation_cache()
 
     # 4) Chunk Vectors
     def upsert_chunk_vector(self, chunks: Sequence[Dict[str, Any]]) -> None:
+        if self._vector_mutations_disabled:  # when vector mutations are disabled: warn, still invalidate the cache, and skip the write
+            self._warn_vector_backend_disabled()
+            self._invalidate_chunk_cache()  # NOTE(review): search_chunks builds its cache from chunksdb, not chunk_vectors -- confirm this invalidation is needed
+            return
         self.chunk_vectors.upsert(chunks)
+        self._invalidate_chunk_cache()
 
     # 5) Entity Vectors
     def upsert_entity_vector(self, entities: List[Dict[str, Any]]) -> None:
+        if self._vector_mutations_disabled:  # when vector mutations are disabled: warn, still invalidate the cache, and skip the write
+            self._warn_vector_backend_disabled()
+            self._invalidate_entity_cache()  # NOTE(review): search_entities builds its cache from graphdb, not entity_vectors -- confirm this invalidation is needed
+            return
         self.entity_vectors.upsert_entities(entities)
+        self._invalidate_entity_cache()
 
     # 6) Relation Vectors
     def upsert_relation_vector(self, relations: List[Dict[str, Any]]) -> None:
-        self.relation_vectors.upsert_relations(relations)
\ No newline at end of file
+        if self._vector_mutations_disabled:  # when vector mutations are disabled: warn, still invalidate the cache, and skip the write
+            self._warn_vector_backend_disabled()
+            self._invalidate_relation_cache()  # NOTE(review): search_relations builds its cache from graphdb, not relation_vectors -- confirm this invalidation is needed
+            return
+        self.relation_vectors.upsert_relations(relations)
+        self._invalidate_relation_cache()
diff --git a/graph/extractor.py b/graph/extractor.py
index 0dff61a..c974c27 100644
--- a/graph/extractor.py
+++ b/graph/extractor.py
@@ -1,80 +1,60 @@
 from __future__ import annotations
 
-import re
+import hashlib
 import json
+import re
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
-from prompts import PROMPTS
-from llm import Chat
 from typing import Any, Dict, Iterable, List, Optional, Tuple
-from settings import settings
-from concurrent.futures import ThreadPoolExecutor, as_completed
+
 from db_storage import Storage
-import hashlib
+from llm import Chat
+from prompts import PROMPTS
+from settings import settings
+from utils import detect_language, normalize_language_name
 
-# ─────────────────────────────────────────────────────────────
-# Helpers
-# ─────────────────────────────────────────────────────────────
 
 def _sha256(s: str) -> str:
-    """
-    Computes SHA-256 hash of a string for caching keys.
+    return hashlib.sha256(s.encode("utf-8")).hexdigest()  # hex SHA-256 digest of the UTF-8 encoded string
 
-    Args:
-        s (str): The input string to hash.
 
-    Returns:
-        str: The hexadecimal representation of the SHA-256 hash.
-    """
-    return hashlib.sha256(s.encode("utf-8")).hexdigest()
+def _default_entity_types() -> List[str]:  # Entity types to use when the caller supplies none.
+    return list(settings.prompts.default_entity_types or PROMPTS["DEFAULT_ENTITY_TYPES"])  # a falsy settings value falls back to the PROMPTS default; list() hands back a fresh list
 
-# ─────────────────────────────────────────────────────────────
-# Prompt Builder
-# ─────────────────────────────────────────────────────────────
 
 def build_entity_relation_prompt(
     text: str,
     language: Optional[str] = None,
     entity_types: Optional[Iterable[str]] = None,
 ) -> str:
-    """
-    Builds a prompt for entity and relationship extraction from text.
-
-    Fills the entity extraction prompt template with appropriate delimiters, language,
-    examples, and entity types. Formats examples to replace placeholder literals.
-
-    Args:
-        text (str): The input text chunk to extract entities and relationships from.
-        language (Optional[str], optional): The output language for extraction.
-            Defaults to the configured default language.
-        entity_types (Optional[Iterable[str]], optional): The types of entities to extract.
-            Defaults to the configured default entity types.
-
-    Returns:
-        str: A formatted prompt ready for LLM consumption.
-    """
-    # 1) Base context for the prompt (without examples yet)
     examples_template = "\n\n".join(PROMPTS.get("entity_extraction_examples", []))
     ctx = dict(
-        tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
-        record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
-        completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
-        entity_types=", ".join(entity_types) if entity_types else ", ".join(PROMPTS["DEFAULT_ENTITY_TYPES"]),
+        tuple_delimiter=settings.prompts.tuple_delimiter,  # delimiters now come from settings.prompts rather than PROMPTS
+        record_delimiter=settings.prompts.record_delimiter,
+        completion_delimiter=settings.prompts.completion_delimiter,
+        entity_types=", ".join(entity_types or _default_entity_types()),  # a falsy entity_types (None, empty list) falls back to the defaults
         examples="",
-        language=language or PROMPTS["DEFAULT_LANGUAGE"],
+        language=normalize_language_name(language, settings.prompts.default_language),  # presumably falls back to the second argument when language is None -- TODO confirm
         input_text=text,
     )
+    ctx["examples"] = examples_template.format(**ctx)  # fill the examples with the same ctx (examples is still "") so their placeholders get the real delimiters
+    return PROMPTS["entity_extraction"].format(**ctx)  # final prompt: main template with the filled-in examples
 
-    # 2) Join & format the examples with the SAME ctx so placeholders are replaced
-    examples = examples_template.format(**ctx)  # fill in delimiters, entity_types, language
-    ctx["examples"] = examples
 
-    # 3) Finally format the main template (including the filled examples)
-    template = PROMPTS["entity_extraction"]
-    return template.format(**ctx)
+def build_entity_audit_prompt(  # Fill the "entity_extraction_audit" template for a second pass over an earlier extraction.
+    text: str,
+    *,
+    initial_extraction: str,  # text inserted into the template's initial_extraction placeholder
+    language: Optional[str] = None,
+    entity_types: Optional[Iterable[str]] = None,
+) -> str:
+    return PROMPTS["entity_extraction_audit"].format(
+        input_text=text,
+        initial_extraction=initial_extraction,
+        language=normalize_language_name(language, settings.prompts.default_language),  # presumably falls back to the second argument when language is None -- TODO confirm
+        entity_types=", ".join(entity_types or _default_entity_types()),  # a falsy entity_types (None, empty list) falls back to the defaults
+    )
 
-# ─────────────────────────────────────────────────────────────
-# Parsing utilities (regex + delimiter tolerant)
-# ─────────────────────────────────────────────────────────────
 
 @dataclass
 class ParsedOutput:
@@ -85,35 +65,21 @@ class ParsedOutput:
 
 
 _FANCY_QUOTES = {
-    "“": '"', "”": '"', "„": '"',
-    "‘": "'", "’": "'",
+    "“": '"',
+    "”": '"',
+    "„": '"',
+    "‘": "'",
+    "’": "'",
 }
 
-def _normalize_quotes(s: str) -> str:
-    """
-    Normalizes fancy/smart quotes to standard ASCII quotes.
-
-    Args:
-        s (str): The input string with potential fancy quotes.
 
-    Returns:
-        str: The string with all fancy quotes replaced by standard quotes.
-    """
-    for k, v in _FANCY_QUOTES.items():
-        s = s.replace(k, v)
+def _normalize_quotes(s: str) -> str:  # Replace every typographic quote listed in _FANCY_QUOTES with its ASCII counterpart.
+    for key, value in _FANCY_QUOTES.items():
+        s = s.replace(key, value)  # replaces all occurrences of this quote character
     return s
 
 
 def _strip_parens(s: str) -> str:
-    """
-    Removes surrounding parentheses from a string if present.
-
-    Args:
-        s (str): The input string.
-
-    Returns:
-        str: The string with outer parentheses removed, or the original string if no parentheses.
-    """
     s = s.strip()
     if s.startswith("(") and s.endswith(")"):
         return s[1:-1].strip()
@@ -121,38 +87,18 @@ def _strip_parens(s: str) -> str:
 
 
 def _strip_quotes(s: str) -> str:
-    """
-    Removes surrounding quotes from a string if present.
-
-    Args:
-        s (str): The input string.
-
-    Returns:
-        str: The string with outer quotes removed, or the original string if no quotes.
-    """
-    s = s.strip()
-    s = _normalize_quotes(s)
+    s = _normalize_quotes(s.strip())  # trim whitespace, then normalize quotes so the check below only has to look for ASCII quote characters
     if (s.startswith('"') and s.endswith('"')) or (s.startswith("'") and s.endswith("'")):
         return s[1:-1]
     return s
 
 
 def _to_float_or_none(x: str) -> Optional[float]:
-    """
-    Extracts and converts a floating-point number from a string.
-
-    Args:
-        x (str): The input string containing a number.
-
-    Returns:
-        Optional[float]: The extracted float value, or None if no valid number is found.
-    """
-    x = x.strip()
-    m = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", x)
-    if not m:
+    match = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", x.strip())  # first number-like token anywhere in the string: optional sign, decimal point and exponent
+    if not match:  # no numeric token at all
         return None
     try:
-        return float(m.group(0))
+        return float(match.group(0))  # convert only the matched token, ignoring any surrounding text
     except Exception:
         return None
 
@@ -163,108 +109,65 @@ def parse_model_output(
     record_delim: Optional[str] = None,
     completion_delim: Optional[str] = None,
 ) -> ParsedOutput:
-    """
-    Parse the LLM output into entities and relationships using regex/splits.
-
-    Expected generated record shapes (from your prompt):
-      ("entity"<|>"name"<|>"type"<|>"description")
-      ("relationship"<|>"source"<|>"target"<|>"description"<|>"keywords"<|>"strength")
-      ("content_keywords"<|>"kw1, kw2, ...")
-
-    Tolerates both real delimiters and accidental literal
-    '{record_delimiter}' / '{tuple_delimiter}' tokens from the model.
-    """
-    tuple_delim = tuple_delim or PROMPTS["DEFAULT_TUPLE_DELIMITER"]
-    record_delim = record_delim or PROMPTS["DEFAULT_RECORD_DELIMITER"]
-    completion_delim = completion_delim or PROMPTS["DEFAULT_COMPLETION_DELIMITER"]
+    tuple_delim = tuple_delim or settings.prompts.tuple_delimiter
+    record_delim = record_delim or settings.prompts.record_delimiter
+    completion_delim = completion_delim or settings.prompts.completion_delimiter
 
     raw = _normalize_quotes(raw)
+    raw = raw.replace("{tuple_delimiter}", tuple_delim)
+    raw = raw.replace("{record_delimiter}", record_delim)
+    raw = raw.replace("{completion_delimiter}", completion_delim)
 
-    # Defensive: handles examples that slipped through or model echoes
-    raw = raw.replace("{tuple_delimiter}", tuple_delim) \
-             .replace("{record_delimiter}", record_delim) \
-             .replace("{completion_delimiter}", completion_delim)
-    
-
-    # Truncate at completion delimiter if present
     if completion_delim in raw:
         raw = raw.split(completion_delim, 1)[0]
-
-    # Some models may echo headings, keep only after "Output:" if present
     if "Output:" in raw:
         raw = raw.split("Output:", 1)[1]
 
-    # Split into records by record delimiter (tolerate trailing spaces/newlines)
-    recs = re.split(rf"{re.escape(record_delim)}\s*", raw)
-    recs = [r.strip() for r in recs if r.strip()]
+    records = re.split(rf"{re.escape(record_delim)}\s*", raw)
+    records = [record.strip() for record in records if record.strip()]
+
     entities: List[Dict[str, Any]] = []
     relationships: List[Dict[str, Any]] = []
     content_keywords: List[str] = []
 
-    for rec in recs:
-        body = _strip_parens(rec)
-        # Split fields by tuple delimiter
-        parts = [p.strip() for p in body.split(tuple_delim)]
-        parts = [_strip_quotes(p) for p in parts if p.strip() != ""]
-
+    for record in records:
+        body = _strip_parens(record)
+        parts = [_strip_quotes(part.strip()) for part in body.split(tuple_delim) if part.strip()]
         if not parts:
             continue
-        tag = parts[0].lower()
 
+        tag = parts[0].lower()
         if tag == "entity" and len(parts) >= 4:
-            # ("entity", name, type, description)
-            name, etype, desc = parts[1], parts[2], parts[3]
-            entities.append({
-                "name": name,
-                "type": etype,
-                "description": desc,
-            })
-
+            entities.append(
+                {
+                    "name": parts[1],
+                    "type": parts[2],
+                    "description": parts[3],
+                }
+            )
         elif tag == "relationship" and len(parts) >= 6:
-            # Supported formats:
-            # 6-tuple: ("relationship", source_name, target_name, description, keywords, strength)
             _, src, tgt, desc, keywords, strength = parts[:6]
-            relationships.append({
-                "source_name": src,
-                "target_name": tgt,
-                "description": desc,
-                "keywords": keywords,
-                "weight": _to_float_or_none(strength),
-            })
-
+            relationships.append(
+                {
+                    "source_name": src,
+                    "target_name": tgt,
+                    "description": desc,
+                    "keywords": keywords,
+                    "weight": _to_float_or_none(strength),
+                }
+            )
         elif tag == "content_keywords" and len(parts) >= 2:
-            # ("content_keywords", "kw1, kw2, ...")
-            kws = [k.strip() for k in parts[1].split(",") if k.strip()]
-            content_keywords.extend(kws)
-
-        else:
-            # Unknown tag—keep raw for debugging (raw_records contains it)
-            pass
+            content_keywords.extend([part.strip() for part in parts[1].split(",") if part.strip()])
 
     return ParsedOutput(
         entities=entities,
         relationships=relationships,
         content_keywords=content_keywords,
-        raw_records=recs,
+        raw_records=records,
     )
 
-# ─────────────────────────────────────────────────────────────
-# Public extraction API (chunk-by-chunk + batch)
-# ─────────────────────────────────────────────────────────────
 
 def _get_chunk_text(chunk: Dict[str, Any]) -> str:
-    """
-    Extracts text content from a chunk dictionary.
-
-    Args:
-        chunk (Dict[str, Any]): A chunk dictionary that may contain text under various keys.
-
-    Returns:
-        str: The text content of the chunk.
-
-    Raises:
-        KeyError: If the chunk doesn't contain text under expected keys.
-    """
     for key in ("text", "content", "body"):
         if key in chunk and isinstance(chunk[key], str):
             return chunk[key]
@@ -272,220 +175,300 @@ def _get_chunk_text(chunk: Dict[str, Any]) -> str:
 
 
 def _require_chunk_uuid(chunk: Dict[str, Any]) -> str:
-    """
-    Validates and extracts the chunk_uuid from a chunk dictionary.
-
-    Args:
-        chunk (Dict[str, Any]): A chunk dictionary.
-
-    Returns:
-        str: The chunk_uuid as a string.
-
-    Raises:
-        KeyError: If chunk_uuid is missing or empty.
-    """
     if "chunk_uuid" not in chunk or not chunk["chunk_uuid"]:
         raise KeyError("Each chunk MUST include 'chunk_uuid' (used as source_id).")
     return str(chunk["chunk_uuid"])
 
-def extract_from_chunks(
-    chunks: Iterable[Dict[str, Any]],
-    language: Optional[str] = None,
-    entity_types: Optional[Iterable[str]] = None,
-    client: Optional[Chat] = None,
-) -> Dict[str, Any]:
-    """
-    High-level convenience: iterate chunks, call LLM, parse, and return collected results.
-    Returns dict with 'entities', 'relationships', 'content_keywords'.
-
-    PERFORMANCE ENHANCEMENTS:
-    - Reuses a single Chat client (singleton) to keep HTTP sessions hot.
-    - Adds a content-addressed LLM cache in SQLite (model + prompt_sha + text_sha).
-      *Cache stores RAW model text; on hits we parse it exactly like fresh outputs.*
-    - Calls the LLM ONLY for cache misses; hits are stitched back in order.
-    - Runs LLM calls for misses concurrently (bounded by settings-based concurrency).
-
-    SETTINGS (from settings.py):
-    - Concurrency:  settings.perf.max_concurrency       (fallback 6 if missing)
-    - Cache on/off: settings.perf.cache_enabled         (fallback True if missing)
-    - Cache TTL:    settings.perf.cache_max_age_hours   (fallback 720 if missing)
-
-    Provenance:
-    - Even on cache hits, each extracted record is stamped with this file/chunk's
-      source identifiers so cross-file queries remain accurate.
-
-    NOTES:
-    - Prompts are built with `build_entity_relation_prompt(...)`.
-    - Parsing uses `parse_model_output(...)`.
-    """
-    # Pull settings
-    MAX_WORKERS = settings.llmperf.max_concurrency
-    CACHE_ENABLED = settings.llmperf.cache_enabled
-    CACHE_MAX_AGE_HOURS = settings.llmperf.cache_max_age_hours
-
-    # 1) Shared LLM client + create Storage facade that connects to SQLite & vector DB
-    chat = client or Chat.singleton()
-    storage: Optional[Storage]
-    try:
-        storage = Storage()
-    except:
-        raise
-    else:
-        storage.init()
-
-    # 2) Materialize chunks to keep stable indexing for stitching results
-    chunk_list: List[Dict[str, Any]] = list(chunks)
-
-    # 3) Build the exact prompts for each chunk
-    #    and compute cache keys (prompt_sha, text_sha) per chunk.
-    prompts: List[str] = []
-    keys: List[Tuple[str, str]] = []  # (prompt_sha, text_sha)
-    for ch in chunk_list:
-        txt = _get_chunk_text(ch)
-        prompt = build_entity_relation_prompt(
-            text=txt,
-            language=language,
-            entity_types=entity_types,
-        )
-        prompts.append(prompt)
-        keys.append((_sha256(prompt), _sha256(txt)))
 
-    # 4) Probe cache; mark misses
+def _resolve_chunk_language(  # Pick the language name for one chunk: explicit argument, then chunk fields/detection, then the configured default.
+    chunk: Dict[str, Any],
+    *,
+    explicit_language: Optional[str] = None,
+) -> str:
+    if explicit_language:  # a caller-supplied language overrides everything stored on the chunk
+        return normalize_language_name(explicit_language, settings.prompts.default_language)
+
+    default_language = settings.prompts.default_language
+    if settings.extraction.use_chunk_language:
+        for key in ("chunk_language", "language", "document_language"):  # first truthy field wins, in this order
+            if chunk.get(key):
+                return normalize_language_name(str(chunk.get(key)), default_language)
+        if settings.extraction.detect_chunk_language:  # detection runs only when none of the fields above is set
+            detected = detect_language(_get_chunk_text(chunk))
+            if detected and detected != "unknown":  # an empty result or the literal "unknown" is ignored
+                return normalize_language_name(detected, default_language)
+    elif chunk.get("document_language"):  # per-chunk language disabled: only the document-level field is consulted
+        return normalize_language_name(str(chunk.get("document_language")), default_language)
+
+    return normalize_language_name(None, default_language)  # presumably yields the default language -- TODO confirm against normalize_language_name
+
+
+def _ensure_storage(storage: Optional[Storage]) -> Storage:  # Return the given Storage, or a newly constructed one, after calling init() on it.
+    active_storage = storage or Storage()
+    active_storage.init()  # NOTE(review): init() also runs on a caller-supplied Storage -- confirm it is safe to call repeatedly
+    return active_storage
+
+
+def _run_cached_prompts(  # Return one raw LLM output per prompt, reusing storage's LLM cache when caching is enabled.
+    *,
+    chat: Chat,
+    storage: Storage,
+    prompts: List[str],
+    text_hash_inputs: List[str],  # hashed into the second half of each cache key; paired with prompts by position
+    system_prompt: str,
+) -> List[str]:
+    max_workers = settings.llmperf.max_concurrency
+    cache_enabled = settings.llmperf.cache_enabled
+    cache_max_age_hours = settings.llmperf.cache_max_age_hours
     model_name = chat.model
-    raw_outputs: List[Optional[str]] = [None] * len(chunk_list)
+
+    keys = [(_sha256(prompt), _sha256(text_hash)) for prompt, text_hash in zip(prompts, text_hash_inputs)]  # NOTE(review): zip() stops at the shorter list -- assumes both lists have equal length
+    raw_outputs: List[Optional[str]] = [None] * len(prompts)  # filled by position so results stay aligned with prompts
     to_run: List[int] = []
 
-    if CACHE_ENABLED:
-        for i, (psha, tsha) in enumerate(keys):
-            cached = storage.get_llm_cache(model_name, psha, tsha, CACHE_MAX_AGE_HOURS)
-            if cached is not None:
-                raw_outputs[i] = cached
+    if cache_enabled:
+        for index, (prompt_sha, text_sha) in enumerate(keys):
+            cached = storage.get_llm_cache(model_name, prompt_sha, text_sha, cache_max_age_hours)
+            if cached is None:  # cache miss: this prompt must be sent to the model
+                to_run.append(index)
             else:
-                to_run.append(i)
+                raw_outputs[index] = cached
     else:
-        to_run = list(range(len(chunk_list)))
+        to_run = list(range(len(prompts)))  # caching off: every prompt is run
 
-    # 5) Call the LLM ONLY for cache misses, in parallel (bounded)
-    def _call_one(i: int) -> str:
-        return chat.generate(prompt=prompts[i], system="You extract entities and relationships precisely in the required format. Do not add commentary.")
+    def _call_one(index: int) -> str:
+        return chat.generate(prompt=prompts[index], system=system_prompt)  # the system prompt is now supplied by the caller
 
     if to_run:
-        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
-            futs = {ex.submit(_call_one, i): i for i in to_run}
-            for fut in as_completed(futs):
-                i = futs[fut]
-                out = fut.result()
-                raw_outputs[i] = out
-                if CACHE_ENABLED and storage is not None:
-                    psha, tsha = keys[i]
-                    storage.put_llm_cache(model_name, psha, tsha, out)
-
-    # 6) Parse outputs and attach per-chunk provenance (identical to your approach)
-    all_entities: List[Dict[str, Any]] = []
-    all_relationships: List[Dict[str, Any]] = []
-    all_keywords: List[str] = []
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = {executor.submit(_call_one, index): index for index in to_run}  # map each future back to its prompt position
+            for future in as_completed(futures):  # completion order is arbitrary; the index restores alignment
+                index = futures[future]
+                output = future.result()  # re-raises any exception raised inside the worker call
+                raw_outputs[index] = output
+                if cache_enabled:
+                    prompt_sha, text_sha = keys[index]
+                    storage.put_llm_cache(model_name, prompt_sha, text_sha, output)  # written from this consuming loop, not from the worker threads
+
+    return [output or "" for output in raw_outputs]  # None or empty outputs are returned as ""
+
+
+def _parse_audit_output(raw: str) -> Dict[str, Any]:  # Parse the audit reply as JSON into missing_entities / missing_relationships / summary; never raises on bad JSON.
+    text = raw.strip()
+    if not text:  # empty reply: return the empty result (without a raw_response key)
+        return {"missing_entities": [], "missing_relationships": [], "summary": ""}
 
-    for i, ch in enumerate(chunk_list):
-        raw = raw_outputs[i] or ""
-        parsed = parse_model_output(raw)   # existing parser
+    try:
+        parsed = json.loads(text)
+    except json.JSONDecodeError:
+        match = re.search(r"\{.*\}", text, re.DOTALL)  # fallback: greedy span from the first "{" to the last "}", across newlines
+        if not match:
+            return {
+                "missing_entities": [],
+                "missing_relationships": [],
+                "summary": "",
+                "raw_response": text,  # failure paths keep the unparsed reply under this key
+            }
+        try:
+            parsed = json.loads(match.group(0))
+        except json.JSONDecodeError:
+            return {
+                "missing_entities": [],
+                "missing_relationships": [],
+                "summary": "",
+                "raw_response": text,
+            }
+
+    if not isinstance(parsed, dict):  # valid JSON that is not an object (e.g. a list) is treated as a failure
+        return {"missing_entities": [], "missing_relationships": [], "summary": "", "raw_response": text}
+
+    return {
+        "missing_entities": parsed.get("missing_entities", []) or [],  # `or` also replaces an explicit null/empty value with the default
+        "missing_relationships": parsed.get("missing_relationships", []) or [],
+        "summary": parsed.get("summary", "") or "",
+    }
 
-        source_id = _require_chunk_uuid(ch)
-        filepath = ch.get("filepath") or ch.get("filename")
 
-        for e in parsed.entities:
-            e["source_id"] = source_id
-            e["filepath"] = filepath
-        for r in parsed.relationships:
-            r["source_id"] = source_id
-            r["filepath"] = filepath
+def extract_from_chunks(
+    chunks: Iterable[Dict[str, Any]],
+    language: Optional[str] = None,
+    entity_types: Optional[Iterable[str]] = None,
+    client: Optional[Chat] = None,
+    storage: Optional[Storage] = None,
+    audit_enabled: Optional[bool] = None,
+) -> Dict[str, Any]:
+    chat = client or Chat.singleton()
+    active_storage = _ensure_storage(storage)
+    chunk_list = list(chunks)
+    resolved_entity_types = list(entity_types or _default_entity_types())
+    do_audit = settings.extraction.audit_second_pass_enabled if audit_enabled is None else audit_enabled
+
+    chunk_languages = [
+        _resolve_chunk_language(chunk, explicit_language=language)
+        for chunk in chunk_list
+    ]
+    chunk_texts = [_get_chunk_text(chunk) for chunk in chunk_list]
+    extraction_prompts = [
+        build_entity_relation_prompt(
+            text=text,
+            language=chunk_language,
+            entity_types=resolved_entity_types,
+        )
+        for text, chunk_language in zip(chunk_texts, chunk_languages)
+    ]
+
+    extraction_outputs = _run_cached_prompts(
+        chat=chat,
+        storage=active_storage,
+        prompts=extraction_prompts,
+        text_hash_inputs=chunk_texts,
+        system_prompt="You extract entities and relationships precisely in the required format. Do not add commentary.",
+    )
 
-        all_entities.extend(parsed.entities)
-        all_relationships.extend(parsed.relationships)
+    all_entities: List[Dict[str, Any]] = []
+    all_relationships: List[Dict[str, Any]] = []
+    all_keywords: List[str] = []
+    chunk_results: List[Dict[str, Any]] = []
+
+    for chunk, chunk_language, raw_output in zip(chunk_list, chunk_languages, extraction_outputs):
+        parsed = parse_model_output(raw_output)
+        source_id = _require_chunk_uuid(chunk)
+        filepath = chunk.get("filepath") or chunk.get("filename")
+
+        entities = []
+        for entity in parsed.entities:
+            stamped = dict(entity)
+            stamped["source_id"] = source_id
+            stamped["filepath"] = filepath
+            entities.append(stamped)
+
+        relationships = []
+        for relationship in parsed.relationships:
+            stamped = dict(relationship)
+            stamped["source_id"] = source_id
+            stamped["filepath"] = filepath
+            relationships.append(stamped)
+
+        all_entities.extend(entities)
+        all_relationships.extend(relationships)
         all_keywords.extend(parsed.content_keywords)
+        chunk_results.append(
+            {
+                "chunk_uuid": source_id,
+                "filepath": filepath,
+                "language": chunk_language,
+                "entities": entities,
+                "relationships": relationships,
+                "content_keywords": parsed.content_keywords,
+                "raw_output": raw_output,
+            }
+        )
+
+    audits: List[Dict[str, Any]] = []
+    if do_audit and chunk_results:
+        audit_prompts = []
+        audit_hash_inputs = []
+        for chunk, chunk_result, chunk_language in zip(chunk_list, chunk_results, chunk_languages):
+            extraction_snapshot = json.dumps(
+                {
+                    "entities": chunk_result["entities"],
+                    "relationships": chunk_result["relationships"],
+                    "content_keywords": chunk_result["content_keywords"],
+                },
+                ensure_ascii=False,
+            )
+            audit_prompts.append(
+                build_entity_audit_prompt(
+                    _get_chunk_text(chunk),
+                    initial_extraction=extraction_snapshot,
+                    language=chunk_language,
+                    entity_types=resolved_entity_types,
+                )
+            )
+            audit_hash_inputs.append(f"{_get_chunk_text(chunk)}\n{extraction_snapshot}")
+
+        audit_outputs = _run_cached_prompts(
+            chat=chat,
+            storage=active_storage,
+            prompts=audit_prompts,
+            text_hash_inputs=audit_hash_inputs,
+            system_prompt="You audit extraction completeness. Return JSON only.",
+        )
+
+        for chunk_result, raw_audit in zip(chunk_results, audit_outputs):
+            parsed_audit = _parse_audit_output(raw_audit)
+            audits.append(
+                {
+                    "chunk_uuid": chunk_result["chunk_uuid"],
+                    "filepath": chunk_result["filepath"],
+                    "language": chunk_result["language"],
+                    **parsed_audit,
+                    "raw_output": raw_audit,
+                }
+            )
 
     return {
         "entities": all_entities,
         "relationships": all_relationships,
         "content_keywords": sorted(set(all_keywords)),
+        "chunk_results": chunk_results,
+        "audits": audits,
     }
 
-# ─────────────────────────────────────────────────────────────
-# CLI (optional) — quick test driver
-# ─────────────────────────────────────────────────────────────
 
-# !! Currently not used, but could be useful for single-chunk extraction
 def extract_entities_relations_for_chunk(
     chunk: Dict[str, Any],
     client: Chat,
     language: Optional[str] = None,
     entity_types: Optional[Iterable[str]] = None,
 ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[str]]:
-    """
-    Run the entity/relationship prompt for a single chunk and parse the result.
-    - Sets entity['source_id'] = chunk['chunk_uuid']
-    - Sets relation['source_id'] = chunk['chunk_uuid']
-    - Also carries 'filepath' (or 'filename') if present
-    """
-    text = _get_chunk_text(chunk)
-    prompt = build_entity_relation_prompt(text=text, language=language, entity_types=entity_types)
-    system = "You extract entities and relationships precisely in the required format. Do not add commentary."
-
-    raw = client.generate(prompt=prompt, system=system)
-    parsed = parse_model_output(raw)
-
-    # Attach source_id (strictly chunk_uuid) and filepath if provided on the chunk
-    source_id = _require_chunk_uuid(chunk)
-    filepath = chunk.get("filepath") or chunk.get("filename")
-
-    for e in parsed.entities:
-        e["source_id"] = source_id
-        e["filepath"] = filepath
-    for r in parsed.relationships:
-        r["source_id"] = source_id
-        r["filepath"] = filepath
-
-    return parsed.entities, parsed.relationships, parsed.content_keywords
+    result = extract_from_chunks(
+        [chunk],
+        language=language,
+        entity_types=entity_types,
+        client=client,
+        audit_enabled=False,
+    )
+    return result["entities"], result["relationships"], result["content_keywords"]
+
 
 def _load_chunks_from_path(path: str) -> List[Dict[str, Any]]:
-    """
-    Load chunks from a JSON or JSONL file.
-    Each record should be a dict with at least 'text' or 'content' and 'chunk_uuid'.
-    """
-    with open(path, "r", encoding="utf-8") as f:
-        data = f.read()
+    with open(path, "r", encoding="utf-8") as file:
+        data = file.read()
     try:
-        # Try JSON array
         obj = json.loads(data)
         if isinstance(obj, list):
-            return obj  # type: ignore[return-value]
+            return obj
         raise ValueError("Expected a list of chunks in JSON.")
     except json.JSONDecodeError:
-        # Try JSONL
         chunks: List[Dict[str, Any]] = []
         for line in data.splitlines():
             line = line.strip()
-            if not line:
-                continue
-            chunks.append(json.loads(line))
+            if line:
+                chunks.append(json.loads(line))
         return chunks
 
 
 def _default_demo_chunks() -> List[Dict[str, Any]]:
-    return [{
-        "chunk_uuid": "demo-1",
-        "text": "Apple launched the Vision Pro with help from Foxconn. Tim Cook presented it in Cupertino during WWDC.",
-        "filename": "/demo/path/a.txt"
-    }]
+    return [
+        {
+            "chunk_uuid": "demo-1",
+            "text": "Apple launched the Vision Pro with help from Foxconn. Tim Cook presented it in Cupertino during WWDC.",
+            "filename": "/demo/path/a.txt",
+        }
+    ]
 
 
 def _print_summary(res: Dict[str, Any]) -> None:
     print("\nEntities:")
-    for e in res["entities"]:
-        print(f"  - {e['name']} [{e['type']}]  src={e.get('source_id')}")
+    for entity in res["entities"]:
+        print(f"  - {entity['name']} [{entity['type']}]  src={entity.get('source_id')}")
     print("\nRelationships:")
-    for r in res["relationships"]:
-        w = r.get("weight")
-        print(f"  - {r['source_name']} <-> {r['target_name']}  w={w}  src={r.get('source_id')}")
+    for relationship in res["relationships"]:
+        print(
+            f"  - {relationship['source_name']} <-> {relationship['target_name']} "
+            f"w={relationship.get('weight')} src={relationship.get('source_id')}"
+        )
     if res["content_keywords"]:
         print("\nKeywords:", ", ".join(res["content_keywords"]))
 
@@ -493,25 +476,25 @@ def _print_summary(res: Dict[str, Any]) -> None:
 if __name__ == "__main__":
     import argparse
 
-    ap = argparse.ArgumentParser(
-        description="Extract entities and relations from chunks using Azure OpenAI and regex parsing."
+    parser = argparse.ArgumentParser(
+        description="Extract entities and relations from chunks using the configured LLM and regex parsing."
     )
-    ap.add_argument("--chunks", type=str, default="", help="Path to JSON/JSONL with chunks (each has text/content and chunk_uuid).")
-    ap.add_argument("--language", type=str, default="", help="Override output language (default from prompts).")
-    ap.add_argument("--entity-types", type=str, default="", help="Comma-separated entity types to enforce.")
-    args = ap.parse_args()
+    parser.add_argument("--chunks", type=str, default="", help="Path to JSON/JSONL with chunks.")
+    parser.add_argument("--language", type=str, default="", help="Override output language.")
+    parser.add_argument("--entity-types", type=str, default="", help="Comma-separated entity types.")
+    args = parser.parse_args()
 
-    language = args.language or None
+    explicit_language = args.language or None
     entity_types = [s.strip() for s in args.entity_types.split(",")] if args.entity_types else None
-
     chunks = _default_demo_chunks() if not args.chunks else _load_chunks_from_path(args.chunks)
 
-    client = Chat.singleton()
-    result = extract_from_chunks(chunks, language=language, entity_types=entity_types, client=client)
-
-    # Pretty print
+    result = extract_from_chunks(
+        chunks,
+        language=explicit_language,
+        entity_types=entity_types,
+        client=Chat.singleton(),
+        audit_enabled=settings.extraction.audit_second_pass_enabled,
+    )
     _print_summary(result)
-
-    # Also dump JSON
     print("\n\nJSON Output:")
-    print(json.dumps(result, ensure_ascii=False, indent=2))
\ No newline at end of file
+    print(json.dumps(result, ensure_ascii=False, indent=2))
diff --git a/graph/fileparser.py b/graph/fileparser.py
index ccc8757..198f23f 100644
--- a/graph/fileparser.py
+++ b/graph/fileparser.py
@@ -9,6 +9,7 @@
 from typing import List, Tuple, Dict, Any, Union
 from langchain_community.document_loaders import BSHTMLLoader
 from langchain_community.document_loaders import TextLoader
+from settings import VALID_EXTENSIONS
 import utils as ut
 import settings
 
@@ -18,8 +19,8 @@ class FileParser:
     and returns pages and metadata in a standardized format.
     """
     
-    def __init__(self):
-        pass
+    # Can we throw this one away? Yes?
+    SUPPORTED_EXTENSIONS = {ext.lower() for ext in VALID_EXTENSIONS}
 
     def parse_file(self, filepath: Union[str, Path]) -> Tuple[List[Tuple[int, str]], Dict[str, Any]]:
         """
@@ -53,10 +54,10 @@ def parse_file(self, filepath: Union[str, Path]) -> Tuple[List[Tuple[int, str]],
         metadata = {
             'doc_id': str(uuid.uuid4()),
             'filename': filepath.name,
-            'filepath': str(filepath.absolute()),
+            'filepath': str(filepath.resolve()),
             'file_size': stat.st_size,
             'last_modified': stat.st_mtime,
-            'created': stat.st_birthtime,
+            'created': getattr(stat, "st_birthtime", stat.st_ctime),
             'extension': extension,
             'mime_type': mimetypes.guess_type(str(filepath))[0]
         }
@@ -83,9 +84,18 @@ def parse_file(self, filepath: Union[str, Path]) -> Tuple[List[Tuple[int, str]],
     
     def _parse_text_file(self, filepath: Path) -> Tuple[List[Tuple[int, str]], Dict[str, Any]]:
         """Parse text files (.txt)"""
-        loader = TextLoader(file_path=filepath, autodetect_encoding=True)
-        text = loader.load()
-        raw_text = text[0].page_content
+        encodings_to_try = ["utf-8", "utf-8-sig", "cp1252", "latin-1"]
+        raw_text = None
+        for encoding in encodings_to_try:
+            try:
+                raw_text = filepath.read_text(encoding=encoding)
+                break
+            except UnicodeDecodeError:
+                continue
+        if raw_text is None:
+            loader = TextLoader(file_path=filepath, autodetect_encoding=False, encoding="utf-8")
+            text = loader.load()
+            raw_text = text[0].page_content
         # txt files do not have multiple pages
         pages = [(0, raw_text)]
         # extract metadata
@@ -220,4 +230,3 @@ def convert_docx_to_pdf(self, docx_path: str) -> str:
         convert(input_path=docx_path, output_path=pdf_path, keep_active=False)
 
         return pdf_path
-
diff --git a/graph/graph_pickle.py b/graph/graph_pickle.py
new file mode 100644
index 0000000..ef9e62a
--- /dev/null
+++ b/graph/graph_pickle.py
@@ -0,0 +1,121 @@
+from __future__ import annotations
+
+import logging
+import pickle
+from pathlib import Path
+from typing import Optional
+
+import networkx as nx
+
+from db_storage import Storage
+
+
+def load_graph_from_pickle(path: Path) -> nx.Graph:
+    with path.open("rb") as handle:
+        graph = pickle.load(handle)
+    if not isinstance(graph, nx.Graph):
+        raise ValueError(f"Pickle at {path} does not contain a NetworkX graph.")
+    return graph
+
+
+def save_graph_to_pickle(graph: nx.Graph, path: Path) -> Path:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("wb") as handle:
+        pickle.dump(graph, handle)
+    return path
+
+
+def build_graph_from_storage(
+    storage: Storage,
+    *,
+    logger: Optional[logging.Logger] = None,
+) -> nx.Graph:
+    graph = nx.Graph()
+
+    with storage.graphdb.connect() as con:
+        node_rows = con.execute(
+            "SELECT name, type, description, source_id, filepath FROM nodes;"
+        ).fetchall()
+        edge_rows = con.execute(
+            "SELECT source_name, target_name, weight, description, keywords, "
+            "source_id, filepath FROM edges;"
+        ).fetchall()
+
+    for name, type_, description, source_id, filepath in node_rows:
+        node_id = (name or "").strip()
+        if not node_id:
+            continue
+
+        chunk_uuids = []
+        if source_id:
+            chunk_uuids = [value.strip() for value in source_id.split("||") if value.strip()]
+
+        graph.add_node(
+            node_id,
+            type=(type_ or "unknown").strip() or "unknown",
+            description=(description or "").strip(),
+            source_id=(source_id or "").strip(),
+            filepath=(filepath or "").strip(),
+            chunk_uuids=chunk_uuids,
+        )
+
+    for source, target, weight, description, keywords, source_id, filepath in edge_rows:
+        src_id = (source or "").strip()
+        tgt_id = (target or "").strip()
+        if not src_id or not tgt_id:
+            continue
+        if src_id not in graph or tgt_id not in graph:
+            if logger is not None:
+                logger.debug("Skipping edge with missing endpoints: %s -> %s", src_id, tgt_id)
+            continue
+
+        chunk_uuids = []
+        if source_id:
+            chunk_uuids = [value.strip() for value in source_id.split("||") if value.strip()]
+
+        graph.add_edge(
+            src_id,
+            tgt_id,
+            weight=float(weight) if weight is not None else 1.0,
+            description=(description or "").strip(),
+            keywords=(keywords or "").strip(),
+            source_id=(source_id or "").strip(),
+            filepath=(filepath or "").strip(),
+            chunk_uuids=chunk_uuids,
+        )
+
+    if logger is not None:
+        logger.debug(
+            "Loaded graph snapshot with %d nodes and %d edges",
+            graph.number_of_nodes(),
+            graph.number_of_edges(),
+        )
+    return graph
+
+
+def load_or_build_graph_snapshot(
+    storage: Storage,
+    *,
+    snapshot_path: Optional[Path] = None,
+    logger: Optional[logging.Logger] = None,
+) -> nx.Graph:
+    if snapshot_path is not None and snapshot_path.exists():
+        try:
+            graph = load_graph_from_pickle(snapshot_path)
+            if logger is not None:
+                logger.debug("Loaded graph snapshot from pickle: %s", snapshot_path)
+            return graph
+        except Exception as exc:
+            if logger is not None:
+                logger.warning(
+                    "Failed to load graph snapshot pickle at %s; rebuilding from SQLite (%s)",
+                    snapshot_path,
+                    exc,
+                )
+
+    graph = build_graph_from_storage(storage, logger=logger)
+    if snapshot_path is not None:
+        save_graph_to_pickle(graph, snapshot_path)
+        if logger is not None:
+            logger.debug("Saved graph snapshot pickle to %s", snapshot_path)
+    return graph
diff --git a/graph/ingestion.py b/graph/ingestion.py
index 834bde4..cb7e0ae 100644
--- a/graph/ingestion.py
+++ b/graph/ingestion.py
@@ -1,8 +1,9 @@
 from __future__ import annotations
+import json
 import mimetypes
 import uuid
 from pathlib import Path
-from typing import Dict, Any, List, Sequence, Tuple, Optional
+from typing import Callable, Dict, Any, List, Sequence, Tuple, Optional, Union
 from collections import Counter, defaultdict
 import os
 # local imports
@@ -10,10 +11,26 @@
 from fileparser import FileParser
 from chunker import chunk_parsed_pages
 from extractor import extract_from_chunks
+from graph_pickle import save_graph_to_pickle, build_graph_from_storage
 from llm import llm_summarize_text
-import settings
+from settings import VALID_EXTENSIONS, settings
+from logging_utils import configure_file_logger
+from project_paths import (
+    ProjectPaths,
+    ensure_project_dirs,
+    list_document_paths,
+    resolve_project_paths,
+)
 import hashlib
 
+
+LOGGER = configure_file_logger(
+    "appl_kgraph.ingestion",
+    log_file=Path("ingestion.log"),
+    level=settings.logging.ingestion_level,
+    enabled=settings.logging.ingestion_enabled,
+)
+
 #--------------------------------------------------
 # Helpers 
 #--------------------------------------------------
@@ -104,6 +121,9 @@ def build_chunks(
     pages: Sequence[Tuple[int, str]],
     doc_id: str,
     filename: str,
+    *,
+    filepath: Optional[str] = None,
+    document_language: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     """
     Normalize chunks for storage:
@@ -126,6 +146,8 @@ def build_chunks(
             "doc_id": doc_id,
             "chunk_id": int(c.get("chunk_id", i)),
             "filename": filename,
+            "filepath": filepath,
+            "document_language": document_language,
             "text": text,
             "char_count": int(c.get("char_count", len(text))),
             "start_page": start,
@@ -133,6 +155,56 @@ def build_chunks(
         })
     return norm
 
+
+def _resolve_runtime_project_paths(
+    *,
+    paths: Sequence[Path],
+    documents_root: Optional[Union[Path, str]] = None,
+    project_paths: Optional[ProjectPaths] = None,
+) -> Optional[ProjectPaths]:
+    if project_paths is not None:
+        return project_paths
+    if documents_root is not None:
+        return resolve_project_paths(documents_root)
+    return None
+
+
+def _configure_ingestion_logger(project_paths: Optional[ProjectPaths]) -> None:
+    log_file = (
+        project_paths.ingestion_log_file
+        if project_paths is not None
+        else Path("ingestion.log")
+    )
+    configure_file_logger(
+        "appl_kgraph.ingestion",
+        log_file=log_file,
+        level=settings.logging.ingestion_level,
+        enabled=settings.logging.ingestion_enabled,
+    )
+
+
+def _write_extraction_audits(
+    project_paths: Optional[ProjectPaths],
+    filename: str,
+    audits: List[Dict[str, Any]],
+) -> None:
+    if project_paths is None or not audits:
+        return
+    ensure_project_dirs(project_paths)
+    target = project_paths.extraction_audits_dir / f"{Path(filename).stem}.audit.json"
+    target.write_text(json.dumps(audits, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+def _write_retrieval_graph_snapshot(
+    storage: Storage,
+    project_paths: Optional[ProjectPaths],
+) -> Optional[Path]:
+    if project_paths is None:
+        return None
+    ensure_project_dirs(project_paths)
+    graph = build_graph_from_storage(storage, logger=LOGGER)
+    return save_graph_to_pickle(graph, project_paths.retrieval_graph_pickle_file)
+
 def _resolve_type(votes: Counter, existing_type: str = "") -> str:
     existing = (existing_type or "").strip()
     if not votes:
@@ -467,22 +539,22 @@ def remove_document_from_storage(storage: Storage, filename: str) -> None:
     # Get the document to retrieve its doc_id
     doc = storage.get_document_by_filename(filename)
     if not doc or not doc.get("doc_id"):
-        print(f"Document {filename} not found in storage.")
+        LOGGER.warning("Document %s not found in storage", filename)
         return
 
     doc_id = doc["doc_id"]
-    print(f"Removing document: {filename} (doc_id: {doc_id})")
+    LOGGER.info("Removing document %s (doc_id=%s)", filename, doc_id)
 
     # Step 1: Get all chunks associated with this document
     chunks = storage.get_chunks_by_doc_id(doc_id)
     if not chunks:
-        print(f"No chunks found for document {filename}")
+        LOGGER.info("No chunks found for document %s", filename)
         # Still proceed to delete the document itself
         storage.delete_document(doc_id)
         return
 
     chunk_uuids = [c["chunk_uuid"] for c in chunks]
-    print(f"Found {len(chunk_uuids)} chunks to process")
+    LOGGER.info("Found %d chunks to process for %s", len(chunk_uuids), filename)
 
     # Step 2: Process GraphDB - update nodes and edges
     delim = settings.settings.ingestion.delimiter
@@ -532,7 +604,7 @@ def remove_document_from_storage(storage: Storage, filename: str) -> None:
     for n in nodes:
         name = n["name"]
         if n["remaining_source_ids"] < 0:
-            print(f"Warning: Node {name} has negative remaining_source_ids")
+            LOGGER.warning("Node %s has negative remaining_source_ids", name)
         elif n["remaining_source_ids"] == 0:
             nodes_to_delete.append(name)
         else:
@@ -551,7 +623,7 @@ def remove_document_from_storage(storage: Storage, filename: str) -> None:
     for e in edges:
         edge_pair = (e["source_name"], e["target_name"])
         if e["remaining_source_ids"] < 0:
-            print(f"Warning: Edge {edge_pair} has negative remaining_source_ids")
+            LOGGER.warning("Edge %s has negative remaining_source_ids", edge_pair)
         elif e["remaining_source_ids"] == 0:
             edges_to_delete.append(edge_pair)
         else:
@@ -570,54 +642,54 @@ def remove_document_from_storage(storage: Storage, filename: str) -> None:
 
     # Apply graph updates
     if nodes_to_update:
-        print(f"Updating {len(nodes_to_update)} nodes with source_id changes")
+        LOGGER.info("Updating %d nodes with source_id changes", len(nodes_to_update))
         storage.graphdb.update_nodes(nodes_to_update)
 
     if nodes_to_delete:
-        print(f"Deleting {len(nodes_to_delete)} nodes with no remaining source_ids")
+        LOGGER.info("Deleting %d nodes with no remaining source_ids", len(nodes_to_delete))
         storage.delete_nodes(nodes_to_delete)
 
     if edges_to_update:
-        print(f"Updating {len(edges_to_update)} edges with source_id changes")
+        LOGGER.info("Updating %d edges with source_id changes", len(edges_to_update))
         storage.graphdb.update_edges(edges_to_update)
 
     if edges_to_delete:
-        print(f"Deleting {len(edges_to_delete)} edges with no remaining source_ids")
+        LOGGER.info("Deleting %d edges with no remaining source_ids", len(edges_to_delete))
         storage.delete_edges(edges_to_delete)
 
     # Step 3: Update EntityVectors - remove source_ids and delete if empty
     # We need to get all entities and check their metadata
     # Since we deleted nodes, we also need to delete their vectors
     if nodes_to_delete:
-        print(f"Removing {len(nodes_to_delete)} entity vectors")
+        LOGGER.info("Removing %d entity vectors", len(nodes_to_delete))
         storage.delete_entity_vector(nodes_to_delete)
 
     # For updated nodes, we need to upsert them in the vector DB
     if nodes_to_update:
-        print(f"Updating {len(nodes_to_update)} entity vectors")
+        LOGGER.info("Updating %d entity vectors", len(nodes_to_update))
         storage.upsert_entity_vector(nodes_to_update)
 
     # Step 4: Update RelationVectors - remove source_ids and delete if empty
     if edges_to_delete:
-        print(f"Removing {len(edges_to_delete)} relation vectors")
+        LOGGER.info("Removing %d relation vectors", len(edges_to_delete))
         storage.delete_relation_vector(edges_to_delete)
 
     # For updated edges, we need to upsert them in the vector DB
     if edges_to_update:
-        print(f"Updating {len(edges_to_update)} relation vectors")
+        LOGGER.info("Updating %d relation vectors", len(edges_to_update))
         storage.upsert_relation_vector(edges_to_update)
 
     # Step 5: Remove chunk vectors
-    print(f"Removing {len(chunk_uuids)} chunk vectors")
+    LOGGER.info("Removing %d chunk vectors", len(chunk_uuids))
     for chunk_uuid in chunk_uuids:
         storage.delete_chunk_vector(chunk_uuid)
 
     # Step 6: Remove chunks from ChunksDB
-    print(f"Removing {len(chunk_uuids)} chunks from ChunksDB")
+    LOGGER.info("Removing %d chunks from ChunksDB", len(chunk_uuids))
     storage.delete_chunks_by_uuids(chunk_uuids)
 
     # Step 7: Remove document from DocumentsDB
-    print("Removing document from DocumentsDB")
+    LOGGER.info("Removing document from DocumentsDB")
     storage.delete_document(doc_id)
 
     # Step 8: Sanity check
@@ -637,15 +709,27 @@ def remove_document_from_storage(storage: Storage, filename: str) -> None:
     for e in edges_after:
         src, tgt = e.get("source_name"), e.get("target_name")
         if src not in node_names_after or tgt not in node_names_after:
-            print(f"Sanity Check Warning: Edge ({src}, {tgt}) exists without corresponding nodes after deletion.")
+            LOGGER.warning(
+                "Sanity check warning: edge (%s, %s) exists without corresponding nodes after deletion",
+                src,
+                tgt,
+            )
             storage.delete_edges( [(src, tgt)] )
             storage.delete_relation_vector( [(src, tgt)] )
-            print(f"Removed edge ({src}, {tgt}) due to missing nodes.")
+            LOGGER.info("Removed edge (%s, %s) due to missing nodes", src, tgt)
 
-    print(f"Successfully removed document {filename} and all associated data")
+    LOGGER.info("Successfully removed document %s and all associated data", filename)
 
 
-def ingest_paths(paths: List[Path]):
+def ingest_paths(
+    paths: List[Path],
+    *,
+    documents_root: Optional[Union[Path, str]] = None,
+    project_paths: Optional[ProjectPaths] = None,
+    storage_paths=None,
+    audit_enabled: Optional[bool] = None,
+    progress_callback: Optional[Callable[[str], None]] = None,
+) -> Dict[str, Any]:
     """
     Ingests files from given paths into the knowledge graph storage system.
 
@@ -658,39 +742,80 @@ def ingest_paths(paths: List[Path]):
     Returns:
         None
     """
-    storage = Storage()
+    def report(message: str) -> None:
+        LOGGER.info(message)
+        if progress_callback is not None:
+            progress_callback(message)
+
+    active_project_paths = _resolve_runtime_project_paths(
+        paths=paths,
+        documents_root=documents_root,
+        project_paths=project_paths,
+    )
+    if active_project_paths is not None:
+        ensure_project_dirs(active_project_paths)
+    _configure_ingestion_logger(active_project_paths)
+
+    effective_storage_paths = (
+        active_project_paths.storage
+        if active_project_paths is not None
+        else storage_paths
+    )
+
+    report("Initializing project storage")
+    storage = Storage(paths=effective_storage_paths)
     storage.init()
+    LOGGER.info("Starting ingestion for %d paths", len(paths))
+    report(f"Queued {len(paths)} files for ingestion")
 
     all_chunks: List[Dict[str, Any]] = []
     all_entities: List[Dict[str, Any]]  = []
     all_relations: List[Dict[str, Any]]  = []
+    processed_files = 0
+    skipped_files = 0
+    removed_files = 0
 
+    report("Scanning existing project documents")
     # Remove documents that are no longer present
     all_existing_docs = storage.get_all_documents()
     existing_filenames = {doc["filename"] for doc in all_existing_docs if doc.get("filename")}
     files_to_be_removed = existing_filenames - {p.name for p in paths}
     for fname in files_to_be_removed:
+        report(f"Removing stale document {fname}")
         remove_document_from_storage(storage, fname)
+        removed_files += 1
 
-    for p in paths:
-        print(f"Processing file: {p}\n")
+    total_paths = len(paths)
+    for index, p in enumerate(paths, start=1):
+        step_prefix = f"{index}/{total_paths} {p.name}"
+        report(f"{step_prefix} - checking file")
         if not p.exists() or not p.is_file():
+            report(f"{step_prefix} - skipped (path missing or not a file)")
+            skipped_files += 1
             continue
         content_hash = file_sha256(p)
         if should_skip_ingestion(storage, p, content_hash):
-            print(f"Skipping {p.name} (unchanged).")
+            LOGGER.info("Skipping %s (unchanged)", p.name)
+            report(f"{step_prefix} - skipped (unchanged)")
+            skipped_files += 1
             continue
         # skip temporary files created by ms word
         if ((p.name.lower().startswith("~$") and p.name.lower().endswith((".docx", ".doc"))) or
             (p.name.lower().endswith((".tmp", ".temp")) and "word" in p.name.lower())):
-            print(f"Skipping temporary file {p.name}.")
+            LOGGER.info("Skipping temporary file %s", p.name)
+            report(f"{step_prefix} - skipped (temporary file)")
+            skipped_files += 1
             continue
+        report(f"{step_prefix} - parsing file")
         pages, file_meta = parse_to_pages(p)
         if not pages or not file_meta:
-            print(f"Skipping {p} due to parsing error.")
+            LOGGER.warning("Skipping %s due to parsing error", p)
+            report(f"{step_prefix} - parsing failed")
+            skipped_files += 1
             continue
         doc_exists = storage.get_document_by_filename(p.name).get("filename") == p.name if storage.get_document_by_filename(p.name) else False
         if doc_exists: # document exists but content hash differs.
+            report(f"{step_prefix} - replacing changed document")
             remove_document_from_storage(storage, p.name)
 
         file_meta = normalize_metadata(file_meta)
@@ -701,10 +826,10 @@ def ingest_paths(paths: List[Path]):
         doc_meta = {
             "doc_id": str(uuid.uuid4()),
             "filename": p.name,
-            "filepath": str(p),                          # keep if useful for tracing
+            "filepath": str(p.resolve()),
             "file_size": st.st_size,
             "last_modified": st.st_mtime,
-            "created": st.st_birthtime,                      
+            "created": getattr(st, "st_birthtime", st.st_ctime),
             "extension": p.suffix.lower(),
             "mime_type": ((file_meta or {}).get("mime_type") or mimetypes.guess_type(str(p))[0] or ""),
             "language": (file_meta or {}).get("language", "unknown"),
@@ -712,15 +837,29 @@ def ingest_paths(paths: List[Path]):
             "full_char_count": len(full_text),
         }
 
+        report(f"{step_prefix} - storing document")
         storage.add_document(doc_meta, full_text)  # from storage.py
 
-        chunks = build_chunks(pages, doc_meta["doc_id"], doc_meta["filename"])
+        report(f"{step_prefix} - building chunks")
+        chunks = build_chunks(
+            pages,
+            doc_meta["doc_id"],
+            doc_meta["filename"],
+            filepath=doc_meta["filepath"],
+            document_language=doc_meta["language"],
+        )
+        report(f"{step_prefix} - storing {len(chunks)} chunks")
         storage.add_chunks(chunks)  # from storage.py
         all_chunks.extend(chunks)
 
         # Extract entities and relations from chunks
         # res['entities'], res['relationships'], res['content_keywords']
-        res = extract_from_chunks(chunks)  # from extractor.py
+        report(f"{step_prefix} - extracting entities and relations")
+        res = extract_from_chunks(
+            chunks,
+            storage=storage,
+            audit_enabled=audit_enabled,
+        )
         
         # Consolidate/merge entities (by (name,type)) and upsert those first
         entities_in = res.get("entities", []) or []
@@ -731,46 +870,75 @@ def ingest_paths(paths: List[Path]):
         if placeholders:
             all_entities.extend(placeholders)           # collect for vector DB later
 
+        report(f"{step_prefix} - merging graph data")
         nodes, edges = merge_graph_data(storage, entities_in, edges_in)
 
         if nodes:
+            report(f"{step_prefix} - writing {len(nodes)} entities")
             storage.upsert_nodes(nodes)                 # write schema
             all_entities.extend(nodes)                  # collect for vector DB later
 
         # Group/merge edges and upsert
         if edges:
+            report(f"{step_prefix} - writing {len(edges)} relations")
             storage.upsert_edges(edges)                 # write schema
             all_relations.extend(edges)                 # collect for vector DB later
+        if res.get("audits"):
+            report(f"{step_prefix} - writing extraction audit")
+        _write_extraction_audits(active_project_paths, p.name, res.get("audits", []) or [])
+        report(f"{step_prefix} - completed")
+        processed_files += 1
 
     # Finally, add all chunks, entities, and relations to vector DB   
     if all_chunks:
+        report("Writing chunk vectors")
         storage.upsert_chunk_vector(all_chunks) # from storage.py
         deduped_entities = dedupe_entities_for_vectors(all_entities)
-        # if all_chunks:
-        #     print(f"[ingestion] sample chunk: {all_chunks[0]}")
         if deduped_entities:
+            report("Writing entity vectors")
             storage.upsert_entity_vector(deduped_entities)
         if all_relations:
+            report("Writing relation vectors")
             storage.upsert_relation_vector(all_relations)
+    report("Writing retrieval snapshot")
+    retrieval_snapshot = _write_retrieval_graph_snapshot(storage, active_project_paths)
+    report("Completed ingestion")
+    LOGGER.info(
+        "Completed ingestion: processed=%d skipped=%d removed=%d chunks=%d entities=%d relations=%d snapshot=%s",
+        processed_files,
+        skipped_files,
+        removed_files,
+        len(all_chunks),
+        len(all_entities),
+        len(all_relations),
+        retrieval_snapshot,
+    )
+    return {
+        "documents_root": str(active_project_paths.documents_root) if active_project_paths else None,
+        "project_root": str(active_project_paths.project_root) if active_project_paths else None,
+        "retrieval_graph_pickle": str(retrieval_snapshot) if retrieval_snapshot is not None else None,
+        "processed_files": processed_files,
+        "skipped_files": skipped_files,
+        "removed_files": removed_files,
+        "chunk_count": len(all_chunks),
+        "entity_count": len(all_entities),
+        "relation_count": len(all_relations),
+    }
 
 
 def main():
-    # root = Path('docs')
-    fileparser = FileParser()
-    # Get source folder with docs from user
-    content_folder_path = input("Source folder path of documents (including path): ")
-    if not content_folder_path or not os.path.isdir(content_folder_path):
-        print("Please enter a valid folder path.")
+    folder_input = input(
+        "Source folder path of documents (leave blank for ./docs): "
+    ).strip()
+    root = Path(folder_input) if folder_input else Path("docs")
+    paths = list_document_paths(root)
+    if not paths:
+        print(
+            f"No supported files found in {root.resolve()} "
+            f"with extensions: {', '.join(VALID_EXTENSIONS)}."
+        )
         return
-    else:
-        paths = [Path(os.path.join(content_folder_path, f)) for f in os.listdir(content_folder_path) 
-                 if ((os.path.isfile(os.path.join(content_folder_path, f))) and (Path(f).suffix in settings.VALID_EXTENSIONS))]
-        if not paths:
-            print(f"📂 No files found with extensions: {', '.join(settings.VALID_EXTENSIONS)}.")
-            return
-        else:
-            ingest_paths(paths)
-
+    ingest_paths(paths, documents_root=root)
 
 if __name__ == "__main__":
     main()
diff --git a/graph/lightrag.py b/graph/lightrag.py
index ba21462..b1cfb4e 100644
--- a/graph/lightrag.py
+++ b/graph/lightrag.py
@@ -7,6 +7,7 @@
 from dataclasses import dataclass
 from functools import lru_cache
 from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
+from pathlib import Path
 
 import networkx as nx
 import tiktoken
@@ -17,6 +18,10 @@
 from db_storage import StoragePaths
 from llm import Chat
 from prompts import PROMPTS
+from logging_utils import configure_file_logger
+from graph_pickle import load_or_build_graph_snapshot
+from project_paths import ProjectPaths
+from query_logging import write_query_log
 
 
 LOGGER = logging.getLogger("LightRAG")
@@ -72,13 +77,13 @@ def render_full_context(result: RetrievalResult) -> str:
 # Logging and token helpers
 # ---------------------------------------------------------------------------
 
-def set_logger(log_file: str) -> None:
-    """Configure the package wide logger."""
-    LOGGER.setLevel(logging.INFO)
-    handler = logging.FileHandler(log_file)
-    handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
-    if not LOGGER.handlers:
-        LOGGER.addHandler(handler)
+def set_logger(log_file: Path) -> None:
+    configure_file_logger(
+        "LightRAG",
+        log_file=log_file,
+        level=settings.logging.retrieval_level,
+        enabled=settings.logging.retrieval_enabled,
+    )
 
 
 @lru_cache(maxsize=4)
@@ -164,10 +169,16 @@ class GraphSnapshot:
 class StorageAdapter:
     """High level helper around the ingestion Storage facade."""
 
-    def __init__(self, paths: Optional[StoragePaths] = None):
+    def __init__(
+        self,
+        paths: Optional[StoragePaths] = None,
+        *,
+        graph_pickle_path: Optional[Path] = None,
+    ):
         self._storage = Storage(paths=paths)
         self._storage.init()
         self._graph_snapshot: Optional[GraphSnapshot] = None
+        self._graph_pickle_path = graph_pickle_path
 
     @property
     def graph(self) -> nx.Graph:
@@ -180,156 +191,66 @@ def refresh_graph(self) -> None:
         self._graph_snapshot = self._load_graph()
 
     def _load_graph(self) -> GraphSnapshot:
-        """Load graph from storage into NetworkX."""
-        graph = nx.Graph()
-
-        with self._storage.graphdb.connect() as con:
-            node_rows = con.execute(
-                "SELECT name, type, description, source_id, filepath FROM nodes;"
-            ).fetchall()
-            edge_rows = con.execute(
-                "SELECT source_name, target_name, weight, description, keywords, "
-                "source_id, filepath FROM edges;"
-            ).fetchall()
-
-        # Add nodes with chunk_uuid list
-        for name, type_, description, source_id, filepath in node_rows:
-            node_id = (name or "").strip()
-            if not node_id:
-                continue
-
-            # Parse source_id to get chunk_uuids
-            chunk_uuids = []
-            if source_id:
-                chunk_uuids = [s.strip() for s in source_id.split("||") if s.strip()]
-
-            graph.add_node(
-                node_id,
-                type=(type_ or "unknown").strip() or "unknown",
-                description=(description or "").strip(),
-                source_id=(source_id or "").strip(),
-                filepath=(filepath or "").strip(),
-                chunk_uuids=chunk_uuids,
-            )
-
-        # Add edges with chunk_uuid list
-        for row in edge_rows:
-            source, target, weight, description, keywords, source_id, filepath = row
-            src_id = (source or "").strip()
-            tgt_id = (target or "").strip()
-            if not src_id or not tgt_id:
-                continue
-            if src_id not in graph or tgt_id not in graph:
-                LOGGER.debug("Skipping edge with missing endpoints: %s -> %s", src_id, tgt_id)
-                continue
-
-            # Parse source_id to get chunk_uuids
-            chunk_uuids = []
-            if source_id:
-                chunk_uuids = [s.strip() for s in source_id.split("||") if s.strip()]
-
-            graph.add_edge(
-                src_id,
-                tgt_id,
-                weight=float(weight) if weight is not None else 1.0,
-                description=(description or "").strip(),
-                keywords=(keywords or "").strip(),
-                source_id=(source_id or "").strip(),
-                filepath=(filepath or "").strip(),
-                chunk_uuids=chunk_uuids,
-            )
-
-        LOGGER.debug(
-            "Loaded graph snapshot with %d nodes and %d edges",
-            graph.number_of_nodes(),
-            graph.number_of_edges(),
+        graph = load_or_build_graph_snapshot(
+            self._storage,
+            snapshot_path=self._graph_pickle_path,
+            logger=LOGGER,
         )
         return GraphSnapshot(graph=graph)
 
     def query_entities(self, text: str, limit: int = 5) -> List[Dict[str, Any]]:
         """Query entity vector index and return entity information with IDs."""
-        results = self._storage.entity_vectors.query(text=text, n_results=limit) or []
+        results = self._storage.search_entities(text=text, n_results=limit) or []
         matches: List[Dict[str, Any]] = []
         if not results:
             return matches
 
-        ids = results[0].get("ids", [])
-        metadatas = results[0].get("metadatas", [])
-        distances = results[0].get("distances", [])
-
-        for i, metadata in enumerate(metadatas):
+        for metadata in results:
             if not isinstance(metadata, dict):
                 continue
-            entity_id = ids[i] if i < len(ids) else ""
-            distance = distances[i] if i < len(distances) else None
             matches.append({
-                "id": entity_id,
-                "name": metadata.get("name", entity_id),
+                "id": metadata.get("name", ""),
+                "name": metadata.get("name", ""),
                 "type": metadata.get("type"),
                 "description": metadata.get("description", ""),
-                "score": _distance_to_similarity(distance),
+                "score": float(metadata.get("score", 0.0) or 0.0),
             })
         return matches
 
     def query_relations(self, text: str, limit: int = 5) -> List[Dict[str, Any]]:
         """Query relation vector index and return relation information with IDs."""
-        results = self._storage.relation_vectors.query(text=text, n_results=limit) or []
+        results = self._storage.search_relations(text=text, n_results=limit) or []
         matches: List[Dict[str, Any]] = []
         if not results:
             return matches
 
-        ids = results[0].get("ids", [])
-        metadatas = results[0].get("metadatas", [])
-        distances = results[0].get("distances", [])
-
-        for i, metadata in enumerate(metadatas):
+        for metadata in results:
             if not isinstance(metadata, dict):
                 continue
-            relation_id = ids[i] if i < len(ids) else ""
-            distance = distances[i] if i < len(distances) else None
             matches.append({
-                "id": relation_id,
+                "id": f"{metadata.get('source_name', '')}::{metadata.get('target_name', '')}",
                 "source_name": metadata.get("source_name", ""),
                 "target_name": metadata.get("target_name", ""),
                 "description": metadata.get("description", ""),
                 "keywords": metadata.get("keywords", ""),
-                "score": _distance_to_similarity(distance),
+                "score": float(metadata.get("score", 0.0) or 0.0),
             })
         return matches
 
     def query_chunks(self, text: str, limit: int = 5) -> List[Dict[str, Any]]:
         """Query chunk vector index and return chunk information."""
-        results = self._storage.chunk_vectors.query(text=text, n_results=limit) or []
+        results = self._storage.search_chunks(text=text, n_results=limit) or []
         matches: List[Dict[str, Any]] = []
-
-        for result in results:
-            metadatas = self._as_list(result.get("metadatas"))
-            ids = self._as_list(result.get("ids"))
-            distances = self._as_list(result.get("distances"))
-            documents = self._as_list(result.get("documents"))
-
-            max_len = max(
-                (len(seq) for seq in (metadatas, ids, distances, documents) if seq),
-                default=0,
-            )
-
-            for index in range(max_len):
-                metadata = metadatas[index] if index < len(metadatas) else {}
-                if not isinstance(metadata, dict):
-                    metadata = {}
-                chunk_id = ids[index] if index < len(ids) else ""
-                distance = distances[index] if index < len(distances) else None
-                document = documents[index] if index < len(documents) else ""
-                if not isinstance(document, str):
-                    document = str(document or "")
-
-                matches.append({
-                    "chunk_uuid": str(chunk_id),
-                    "document_id": str(metadata.get("doc_id", "")),
-                    "filename": str(metadata.get("filename", "")),
-                    "text": document,
-                    "score": _distance_to_similarity(distance),
-                })
+        for metadata in results:
+            if not isinstance(metadata, dict):
+                continue
+            matches.append({
+                "chunk_uuid": str(metadata.get("chunk_uuid", "")),
+                "document_id": str(metadata.get("doc_id", "")),
+                "filename": str(metadata.get("filename", "")),
+                "text": str(metadata.get("text", "")),
+                "score": float(metadata.get("score", 0.0) or 0.0),
+            })
         return matches
 
     def get_chunk_by_uuid(self, chunk_uuid: str) -> Optional[Dict[str, Any]]:
@@ -418,7 +339,7 @@ async def extract_keywords(
         Tuple of (hl_keywords, ll_keywords)
     """
     examples = "\n".join(PROMPTS["keywords_extraction_examples"])
-    language = PROMPTS["DEFAULT_LANGUAGE"]
+    language = settings.prompts.default_language
     history_context = get_conversation_turns(conversation_history, history_turns)
 
     prompt = PROMPTS["keywords_extraction"].format(
@@ -652,6 +573,9 @@ def get_vector_context(
     for chunk in chunk_matches:
         all_chunks.append({
             "id": chunk["chunk_uuid"],
+            "chunk_uuid": chunk["chunk_uuid"],
+            "document_id": chunk.get("document_id", ""),
+            "filename": chunk.get("filename", ""),
             "text": chunk["text"],
             "source_type": "vector",
             "score": chunk.get("score", 0.0),
@@ -708,6 +632,9 @@ def extract_chunks_from_nodes(
         if chunk:
             result_chunks.append({
                 "id": chunk["chunk_uuid"],
+                "chunk_uuid": chunk["chunk_uuid"],
+                "document_id": chunk.get("doc_id", ""),
+                "filename": chunk.get("filename", ""),
                 "text": chunk.get("text", ""),
                 "order": item["index"],
                 "relation": item["relation_score"],
@@ -749,6 +676,9 @@ def extract_chunks_from_edges(
         if chunk:
             result_chunks.append({
                 "id": chunk["chunk_uuid"],
+                "chunk_uuid": chunk["chunk_uuid"],
+                "document_id": chunk.get("doc_id", ""),
+                "filename": chunk.get("filename", ""),
                 "text": chunk.get("text", ""),
                 "order": index,
                 "source_type": "relationship",
@@ -946,10 +876,15 @@ def lightrag_prompt(
     
     context = naive_context if settings.retrieval.light_mode == "naive" else kg_context + naive_context
     history_context = get_conversation_turns(history)
-    user_prompt = PROMPTS["DEFAULT_USER_PROMPT"]
-    sys_prompt_template = PROMPTS["lightrag_response"] if settings.retrieval.light_mode != "naive" else PROMPTS["rag_response_naive"]
+    user_prompt = settings.prompts.default_user_prompt
+    sys_prompt_template = (
+        PROMPTS["lightrag_response"]
+        if settings.retrieval.light_mode != "naive"
+        else PROMPTS["naive_rag_response"]
+    )
     sys_prompt = sys_prompt_template.format(
         context_data=context,
+        content_data=context,
         response_type=settings.retrieval.response_type,
         history=history_context,
         user_prompt=user_prompt,
@@ -958,6 +893,25 @@ def lightrag_prompt(
     return sys_prompt
 
 
+def _retrieval_model_metadata() -> Dict[str, Any]:
+    provider = settings.provider.provider
+    if provider == "azure":
+        return {
+            "provider": provider,
+            "model_name": settings.provider.azure_llm_deployment,
+            "model_version": settings.provider.azure_api_version,
+            "api_version": settings.provider.azure_api_version,
+            "endpoint": settings.provider.azure_endpoint,
+        }
+    return {
+        "provider": provider,
+        "model_name": settings.provider.openai_llm_model,
+        "model_version": settings.provider.openai_llm_model,
+        "api_version": "",
+        "endpoint": settings.provider.openai_base_url or "https://api.openai.com/v1",
+    }
+
+
 # ---------------------------------------------------------------------------
 # LightRAG entry point
 # ---------------------------------------------------------------------------
@@ -975,12 +929,35 @@ def __init__(
         self,
         *,
         storage_paths: Optional[StoragePaths] = None,
+        project_paths: Optional[ProjectPaths] = None,
         system_prompt: Optional[str] = None,
-        log_file: str = "LightRAG.log",
+        log_file: Optional[str] = None,
     ) -> None:
-        set_logger(log_file)
+        self._project_paths = project_paths
+        effective_storage_paths = (
+            storage_paths
+            if storage_paths is not None
+            else (project_paths.storage if project_paths is not None else None)
+        )
+        effective_log_file = (
+            Path(log_file)
+            if log_file is not None
+            else (
+                project_paths.lightrag_log_file
+                if project_paths is not None
+                else Path("LightRAG.log")
+            )
+        )
+        set_logger(effective_log_file)
         LOGGER.info("Initialising LightRAG retriever")
-        self._storage = StorageAdapter(paths=storage_paths)
+        self._storage = StorageAdapter(
+            paths=effective_storage_paths,
+            graph_pickle_path=(
+                project_paths.retrieval_graph_pickle_file
+                if project_paths is not None
+                else None
+            ),
+        )
         self._chat = RetrieveChat(system_prompt=system_prompt)
 
     async def aretrieve(
@@ -1004,7 +981,14 @@ async def aretrieve(
         # Handle empty keywords
         if hl_keywords == [] and ll_keywords == []:
             LOGGER.warning("low_level_keywords and high_level_keywords is empty")
-            return PROMPTS["fail_response"]
+            return RetrievalResult(
+                answer=PROMPTS["fail_response"],
+                entities_context=[],
+                relations_context=[],
+                all_chunks=[],
+                hl_keywords=[],
+                ll_keywords=[],
+            )
         if ll_keywords == [] and retrieval_mode in ["local", "hybrid"]:
             LOGGER.warning(f"low_level_keywords is empty, switching from {retrieval_mode} mode to global mode")
             retrieval_mode = "global"
@@ -1041,7 +1025,7 @@ async def aretrieve(
             temperature=settings.retrieval.llm_temperature,
         )
 
-        return RetrievalResult(
+        result = RetrievalResult(
             answer=answer,
             entities_context=entities_context,
             relations_context=relations_context,
@@ -1049,6 +1033,33 @@ async def aretrieve(
             hl_keywords=hl_keywords,
             ll_keywords=ll_keywords,
         )
+        if settings.logging.qa_enabled:
+            write_query_log(
+                project_paths=self._project_paths,
+                retriever_name="lightrag",
+                payload={
+                    "question": question,
+                    "answer": answer,
+                    "active_documents_root": str(self._project_paths.documents_root)
+                    if self._project_paths
+                    else None,
+                    "conversation_history": conversation_history or [],
+                    "retrieval_metadata": {
+                        "retrieval_mode": retrieval_mode,
+                        "response_type": settings.retrieval.response_type,
+                        "entity_top_k": settings.retrieval.entity_top_k,
+                        "relation_top_k": settings.retrieval.relation_top_k,
+                        "chunk_top_k": settings.retrieval.chunk_top_k,
+                    },
+                    "model": _retrieval_model_metadata(),
+                    "high_level_keywords": hl_keywords,
+                    "low_level_keywords": ll_keywords,
+                    "retrieved_entities": entities_context,
+                    "retrieved_relationships": relations_context,
+                    "retrieved_chunks": all_chunks,
+                },
+            )
+        return result
 
     def retrieve(
         self,
diff --git a/graph/logging_utils.py b/graph/logging_utils.py
new file mode 100644
index 0000000..64967d6
--- /dev/null
+++ b/graph/logging_utils.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+
+def configure_file_logger(
+    name: str,
+    *,
+    log_file: Path,
+    level: str = "INFO",
+    enabled: bool = True,
+) -> logging.Logger:
+    logger = logging.getLogger(name)
+    logger.setLevel(getattr(logging, (level or "INFO").upper(), logging.INFO))
+    logger.propagate = False
+
+    handler_key = "appl_kgraph_managed"
+    for handler in list(logger.handlers):
+        if getattr(handler, handler_key, False):
+            logger.removeHandler(handler)
+            handler.close()
+
+    if enabled:
+        log_file.parent.mkdir(parents=True, exist_ok=True)
+        handler = logging.FileHandler(log_file, encoding="utf-8")
+        handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
+        setattr(handler, handler_key, True)
+        logger.addHandler(handler)
+
+    return logger
diff --git a/graph/main.py b/graph/main.py
index 802d666..1e86348 100644
--- a/graph/main.py
+++ b/graph/main.py
@@ -1,87 +1,92 @@
-import asyncio
-from pathlib import Path
-from typing import Optional
-# local imports
-from ingestion import ingest_paths
-from pathrag import PathRAG, render_full_context
-from fileparser import FileParser
-from lightrag import RetrievalResult
-
-
-async def ask_with_pathrag(question: str, verbose: bool = False, conversation_history=None) -> None:
-    """
-    Asks a question using PathRAG retrieval and prints the answer with context.
-
-    Args:
-        question (str): The question to ask.
-        verbose (bool, optional): If True, displays full context details. Defaults to False.
-        conversation_history (optional): List of (role, text) tuples for conversation history.
-
-    Returns:
-        None
-    """
-    rag = PathRAG(
-        system_prompt=""
-    )
-    result = await rag.aretrieve(question, conversation_history=conversation_history)
-    print("Answer:\n", result.answer)
-
-    print(render_full_context(result) if verbose else "")
-    if not verbose:
-        for window in result.context_windows:
-            print(f"\n[{window.label}] score={window.score:.2f}\n{window.text}")
-
-
-async def ask_with_lightrag(question: str, verbose: Optional[bool] = False, history: Optional[list] = None) -> RetrievalResult:
-    """
-    Asks a question using LightRAG retrieval and prints the answer with context.
-
-    Args:
-        question (str): The question to ask.
-        verbose (bool, optional): If True, displays full context details. Defaults to False.
-        history (list, optional): Conversation history. Defaults to None.
-
-    Returns:
-        Result object containing the answer and context.
-    """
-    from lightrag import LightRAG
-    from lightrag import render_full_context
-    rag = LightRAG(
-        system_prompt=""
-    )
-    result = await rag.aretrieve(question, conversation_history=history)
-    print("Answer:\n", result.answer)
-
-    print(render_full_context(result) if verbose else "")
-    
-    return result
-
-
-def main():
-    """
-    Main entry point for document ingestion and Q&A demonstration.
-
-    Ingests documents from the 'docs' directory and runs a sample PathRAG query.
-    """
-    root = Path('docs')
-    paths = FileParser(root).filepaths
-    if not paths:
-        print("No files to ingest.")
-        return
-    ingest_paths(paths)
-    # query = "Who are the authors of LayoutParser and do they overlap any of the other articles?"
-    query = input("Enter your question: ")
-    conversation_history = []  # List[Tuple[str, str]] with role in {"user", "assistant"}
-    while query not in ("exit", "quit"):
-        print("\n--- PathRAG Response ---\n")
-        asyncio.run(ask_with_pathrag(query, verbose=True, conversation_history=conversation_history))
-        print("\n---\n")
-        print("\n--- LightRAG Response ---\n")
-        result = asyncio.run(ask_with_lightrag(query, verbose=True, history=conversation_history))
-        conversation_history.append(("user", query))
-        conversation_history.append(("assistant", result.answer))
-        print("\n---\n")
-        query = input("Enter your next question: ")
-
-if __name__ == "__main__":
-    main()
+from __future__ import annotations
+
+import argparse
+import asyncio
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+from ingestion import ingest_paths
+from lightrag import LightRAG, RetrievalResult
+from pathrag import PathRAG, render_full_context
+from project_paths import list_document_paths, resolve_project_paths
+
+
+async def ask_with_pathrag(
+    question: str,
+    *,
+    documents_root: Path,
+    verbose: bool = False,
+    conversation_history: Optional[List[Tuple[str, str]]] = None,
+) -> None:
+    project_paths = resolve_project_paths(documents_root)
+    rag = PathRAG(project_paths=project_paths, system_prompt="")
+    result = await rag.aretrieve(question, conversation_history=conversation_history)
+    print("Answer:\n", result.answer)
+    if verbose:
+        print(render_full_context(result))
+    elif result.context_windows:
+        for window in result.context_windows:
+            print(f"\n[{window.label}] score={window.score:.2f}\n{window.text}")
+
+
+async def ask_with_lightrag(
+    question: str,
+    *,
+    documents_root: Path,
+    verbose: bool = False,
+    history: Optional[List[Tuple[str, str]]] = None,
+) -> RetrievalResult:
+    project_paths = resolve_project_paths(documents_root)
+    rag = LightRAG(project_paths=project_paths, system_prompt="")
+    result = await rag.aretrieve(question, conversation_history=history)
+    print("Answer:\n", result.answer)
+    if verbose:
+        from lightrag import render_full_context as render_lightrag_context
+
+        print(render_lightrag_context(result))
+    return result
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Ingest a document folder and query the project-scoped RAG stores.")
+    parser.add_argument("documents_root", nargs="?", default="docs", help="Folder containing documents to ingest and query.")
+    args = parser.parse_args()
+
+    documents_root = Path(args.documents_root).expanduser().resolve()
+    paths = list_document_paths(documents_root)
+    if not paths:
+        print("No files to ingest.")
+        return
+
+    ingest_paths(paths, documents_root=documents_root)
+
+    conversation_history: List[Tuple[str, str]] = []
+    query = input("Enter your question: ")
+    while query not in ("exit", "quit"):
+        print("\n--- PathRAG Response ---\n")
+        asyncio.run(
+            ask_with_pathrag(
+                query,
+                documents_root=documents_root,
+                verbose=True,
+                conversation_history=conversation_history,
+            )
+        )
+        print("\n---\n")
+        print("\n--- LightRAG Response ---\n")
+        result = asyncio.run(
+            ask_with_lightrag(
+                query,
+                documents_root=documents_root,
+                verbose=True,
+                history=conversation_history,
+            )
+        )
+        conversation_history.append(("user", query))
+        conversation_history.append(("assistant", result.answer))
+        print("\n---\n")
+        query = input("Enter your next question: ")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/graph/pathrag.py b/graph/pathrag.py
index 675667b..b960027 100644
--- a/graph/pathrag.py
+++ b/graph/pathrag.py
@@ -1,25 +1,30 @@
 """Single-file PathRAG retriever integrated with the ingestion storage backend."""
 from __future__ import annotations
 
-import asyncio
-import logging
-import json
-from dataclasses import dataclass
-from functools import lru_cache
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
-from collections import defaultdict
-
-import networkx as nx
-import tiktoken
-
-from settings import settings
-from db_storage import Storage
-from db_storage import StoragePaths
-from llm import Chat
-from prompts import PROMPTS
-
-
-LOGGER = logging.getLogger("PathRAG")
+import asyncio
+import logging
+import json
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
+from collections import defaultdict
+from pathlib import Path
+
+import networkx as nx
+import tiktoken
+
+from settings import settings
+from db_storage import Storage
+from db_storage import StoragePaths
+from llm import Chat
+from prompts import PROMPTS
+from logging_utils import configure_file_logger
+from graph_pickle import load_or_build_graph_snapshot
+from project_paths import ProjectPaths
+from query_logging import write_query_log
+
+
+LOGGER = logging.getLogger("PathRAG")
 
 # ---------------------------------------------------------------------------
 # Verbosity helper
@@ -82,14 +87,13 @@ def render_full_context(result: RetrievalResult) -> str:
 # Logging and token helpers
 # ---------------------------------------------------------------------------
 
-def set_logger(log_file: str) -> None:
-    """Configure the package wide logger."""
-
-    LOGGER.setLevel(logging.INFO)
-    handler = logging.FileHandler(log_file)
-    handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
-    if not LOGGER.handlers:
-        LOGGER.addHandler(handler)
+def set_logger(log_file: Path) -> None:
+    """Hand "PathRAG" logger setup to ``configure_file_logger`` using the retrieval logging settings."""
+    retrieval_logging = settings.logging
+    configure_file_logger(
+        "PathRAG", log_file=log_file,
+        level=retrieval_logging.retrieval_level, enabled=retrieval_logging.retrieval_enabled,
+    )
 
 
 @lru_cache(maxsize=4)
@@ -373,14 +377,20 @@ class GraphSnapshot:
 # ---------------------------------------------------------------------------
 
 
-class StorageAdapter:
-    """High level helper around the ingestion ``Storage`` facade."""
-
-    def __init__(self, paths: Optional[StoragePaths] = None):
-        self._storage = Storage(paths=paths)
-        # Ensure tables exist before we start querying them.
-        self._storage.init()
-        self._graph_snapshot: Optional[GraphSnapshot] = None
+class StorageAdapter:
+    """High level helper around the ingestion ``Storage`` facade."""
+
+    def __init__(
+        self,
+        paths: Optional[StoragePaths] = None,
+        *,
+        graph_pickle_path: Optional[Path] = None,
+    ):
+        """Open the ``Storage`` facade and remember the optional graph pickle path."""
+        self._graph_pickle_path = graph_pickle_path
+        self._graph_snapshot: Optional[GraphSnapshot] = None
+        self._storage = Storage(paths=paths)
+        self._storage.init()  # Ensure tables exist before we start querying them.
 
     # ------------------------------------------------------------------
     # Graph helpers
@@ -396,55 +406,13 @@ def refresh_graph(self) -> None:
 
         self._graph_snapshot = self._load_graph()
 
-    def _load_graph(self) -> GraphSnapshot:
-        graph = nx.Graph()
-
-        with self._storage.graphdb.connect() as con:
-            node_rows = con.execute(
-                "SELECT name, type, description, source_id, filepath FROM nodes;"
-            ).fetchall()
-            edge_rows = con.execute(
-                "SELECT source_name, target_name, weight, description, keywords, "
-                "source_id, filepath FROM edges;"
-            ).fetchall()
-
-        for name, type_, description, source_id, filepath in node_rows:
-            node_id = (name or "").strip()
-            if not node_id:
-                continue
-            graph.add_node(
-                node_id,
-                type=(type_ or "unknown").strip() or "unknown",
-                description=(description or "").strip(),
-                source_id=(source_id or "").strip(),
-                filepath=(filepath or "").strip(),
-            )
-
-        for row in edge_rows:
-            source, target, weight, description, keywords, source_id, filepath = row
-            src_id = (source or "").strip()
-            tgt_id = (target or "").strip()
-            if not src_id or not tgt_id:
-                continue
-            if src_id not in graph or tgt_id not in graph:
-                LOGGER.debug("Skipping edge with missing endpoints: %s -> %s", src_id, tgt_id)
-                continue
-            graph.add_edge(
-                src_id,
-                tgt_id,
-                weight=float(weight) if weight is not None else 1.0,
-                description=(description or "").strip(),
-                keywords=(keywords or "").strip(),
-                source_id=(source_id or "").strip(),
-                filepath=(filepath or "").strip(),
-            )
-
-        LOGGER.debug(
-            "Loaded graph snapshot with %d nodes and %d edges",
-            graph.number_of_nodes(),
-            graph.number_of_edges(),
-        )
-        return GraphSnapshot(graph=graph)
+    def _load_graph(self) -> GraphSnapshot:
+        """Wrap the graph from ``load_or_build_graph_snapshot`` in a ``GraphSnapshot``."""
+        return GraphSnapshot(
+            graph=load_or_build_graph_snapshot(
+                self._storage, snapshot_path=self._graph_pickle_path, logger=LOGGER
+            )
+        )
 
     def get_node(self, name: str) -> Optional[Dict[str, Any]]:
         node_name = name.strip()
@@ -476,95 +444,64 @@ def _as_list(value: Any) -> List[Any]:
             return list(value)
         return [value]
 
-    def query_entities(self, text: str, limit: int = 5) -> List[EntityMatch]:
-        """Query entity vector index and return EntityMatch objects."""
-        results = self._storage.entity_vectors.query(text=text, n_results=limit) or []
-        matches: List[EntityMatch] = []
-        if not results:
-            return matches
-
-        # Chroma returns a single dict with lists
-        ids = results[0].get("ids", [])
-        metadatas = results[0].get("metadatas", [])
-        distances = results[0].get("distances", [])
-
-        for i, metadata in enumerate(metadatas):
-            if not isinstance(metadata, dict):
-                continue
-            entity_id = ids[i] if i < len(ids) else ""
-            distance = distances[i] if i < len(distances) else None
-            matches.append(
-                EntityMatch(
-                    name=metadata.get("name", entity_id),
-                    type=metadata.get("type"),
-                    description=metadata.get("description", ""),
-                    score=_distance_to_similarity(distance),
-                )
-            )
-        return matches
-
-    def query_relations(self, text: str, limit: int = 5) -> List[RelationMatch]:
-        """Query relation vector index and return RelationMatch objects."""
-        results = self._storage.relation_vectors.query(text=text, n_results=limit) or []
-        matches: List[RelationMatch] = []
-        if not results:
-            return matches
-
-        ids = results[0].get("ids", [])
-        metadatas = results[0].get("metadatas", [])
-        distances = results[0].get("distances", [])
-
-        for i, metadata in enumerate(metadatas):
-            if not isinstance(metadata, dict):
-                continue
-            distance = distances[i] if i < len(distances) else None
-            matches.append(
-                RelationMatch(
-                    source_name=metadata.get("source_name", ""),
-                    target_name=metadata.get("target_name", ""),
-                    description=metadata.get("description", ""),
-                    keywords=metadata.get("keywords", ""),
-                    score=_distance_to_similarity(distance),
-                )
-            )
-        return matches
-
-
-    def query_chunks(self, text: str, limit: int = 5) -> List[ChunkMatch]:
-        results = self._storage.chunk_vectors.query(text=text, n_results=limit) or []
-        matches: List[ChunkMatch] = []
-        for result in results:
-            metadatas = self._as_list(result.get("metadatas"))
-            ids = self._as_list(result.get("ids"))
-            distances = self._as_list(result.get("distances"))
-            documents = self._as_list(result.get("documents"))
-            max_len = max(
-                (
-                    len(seq)
-                    for seq in (metadatas, ids, distances, documents)
-                    if seq
-                ),
-                default=0,
-            )
-            for index in range(max_len):
-                metadata = metadatas[index] if index < len(metadatas) else {}
-                if not isinstance(metadata, dict):
-                    metadata = {}
-                chunk_id = ids[index] if index < len(ids) else ""
-                distance = distances[index] if index < len(distances) else None
-                document = documents[index] if index < len(documents) else ""
-                if not isinstance(document, str):
-                    document = str(document or "")
-                matches.append(
-                    ChunkMatch(
-                        chunk_uuid=str(chunk_id),
-                        document_id=str(metadata.get("doc_id", "")),
-                        filename=str(metadata.get("filename", "")),
-                        text=document,
-                        score=_distance_to_similarity(distance),
-                    )
-                )
-        return matches
+    def query_entities(self, text: str, limit: int = 5) -> List[EntityMatch]:
+        """Search the entity index for ``text`` and wrap each hit as an ``EntityMatch``.
+
+        ``limit`` is forwarded as ``n_results``. A falsy search result yields an
+        empty list, rows that are not dicts are skipped, and a missing or falsy
+        ``score`` is reported as ``0.0``.
+        """
+        rows = self._storage.search_entities(text=text, n_results=limit) or []
+        # Only dict rows are converted; the match fields and "score" are read by key.
+        return [
+            EntityMatch(
+                name=row.get("name", ""),
+                type=row.get("type"),
+                description=row.get("description", ""),
+                score=float(row.get("score", 0.0) or 0.0),
+            )
+            for row in rows
+            if isinstance(row, dict)
+        ]
+
+    def query_relations(self, text: str, limit: int = 5) -> List[RelationMatch]:
+        """Search the relation index for ``text`` and wrap each hit as a ``RelationMatch``.
+
+        ``limit`` is forwarded as ``n_results``. A falsy search result yields an
+        empty list, rows that are not dicts are skipped, and a missing or falsy
+        ``score`` is reported as ``0.0``.
+        """
+        rows = self._storage.search_relations(text=text, n_results=limit) or []
+        # Only dict rows are converted; the match fields and "score" are read by key.
+        return [
+            RelationMatch(
+                source_name=row.get("source_name", ""),
+                target_name=row.get("target_name", ""),
+                description=row.get("description", ""),
+                keywords=row.get("keywords", ""),
+                score=float(row.get("score", 0.0) or 0.0),
+            )
+            for row in rows
+            if isinstance(row, dict)
+        ]
+
+
+    def query_chunks(self, text: str, limit: int = 5) -> List[ChunkMatch]:
+        """Search the chunk index for ``text`` and wrap each dict row as a ``ChunkMatch``."""
+        rows = self._storage.search_chunks(text=text, n_results=limit) or []
+        # Identifier, filename and text fields are passed through str();
+        # a missing or falsy score is reported as 0.0.
+        return [
+            ChunkMatch(
+                chunk_uuid=str(row.get("chunk_uuid", "")),
+                document_id=str(row.get("doc_id", "")),
+                filename=str(row.get("filename", "")),
+                text=str(row.get("text", "")),
+                score=float(row.get("score", 0.0) or 0.0),
+            )
+            for row in rows
+            if isinstance(row, dict)
+        ]
 
     # ------------------------------------------------------------------
     # Convenience helpers
@@ -772,10 +709,29 @@ def build_context_for_prompt(
 # ---------------------------------------------------------------------------
 
 # !! lightweight placeholder for testing, replace with imported PROMPT when finished.
-RAG_PROMPT = (
+RAG_PROMPT = (
     "You are a helpful assistant. Use the supplied context to answer the question."
     "\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer in Markdown."
-)
+)
+
+
+def _retrieval_model_metadata() -> Dict[str, Any]:
+    """Describe the configured LLM provider for inclusion in query logs.
+
+    Azure reports its deployment name and API version (the API version is
+    also used as ``model_version``); any other provider is described with the
+    OpenAI model settings and the public OpenAI endpoint when no base URL is set.
+    """
+    cfg = settings.provider
+    is_azure = cfg.provider == "azure"
+    model = cfg.azure_llm_deployment if is_azure else cfg.openai_llm_model
+    return {
+        "provider": cfg.provider,
+        "model_name": model,
+        "model_version": cfg.azure_api_version if is_azure else model,
+        "api_version": cfg.azure_api_version if is_azure else "",
+        "endpoint": cfg.azure_endpoint if is_azure else (cfg.openai_base_url or "https://api.openai.com/v1"),
+    }
 
 
 # ---------------------------------------------------------------------------
@@ -790,17 +746,40 @@ class PathRAG:
         result = rag.retrieve("What is the capital of France?")
     """
 
-    def __init__(
-        self,
-        *,
-        storage_paths: Optional[StoragePaths] = None,
-        system_prompt: Optional[str] = None,
-        log_file: str = "PathRAG.log",
-    ) -> None:
-        set_logger(log_file)
-        LOGGER.info("Initialising PathRAG retriever")
-        self._storage = StorageAdapter(paths=storage_paths)
-        self._chat = RetrieveChat(system_prompt=system_prompt)
+    def __init__(
+        self,
+        *,
+        storage_paths: Optional[StoragePaths] = None,
+        project_paths: Optional[ProjectPaths] = None,
+        system_prompt: Optional[str] = None,
+        log_file: Optional[str] = None,
+    ) -> None:
+        """Wire up logging, storage and the chat client for retrieval.
+
+        Explicit ``storage_paths`` / ``log_file`` arguments win; otherwise the
+        values come from ``project_paths`` when it is given. Without either,
+        storage paths stay ``None`` and the log goes to ``PathRAG.log``.
+        """
+        self._project_paths = project_paths
+
+        # Storage location: explicit argument first, then the project layout.
+        effective_storage_paths = storage_paths
+        if effective_storage_paths is None and project_paths is not None:
+            effective_storage_paths = project_paths.storage
+
+        # Log file: explicit argument, then the project layout, then a local default.
+        if log_file is not None:
+            effective_log_file = Path(log_file)
+        elif project_paths is not None:
+            effective_log_file = project_paths.pathrag_log_file
+        else:
+            effective_log_file = Path("PathRAG.log")
+
+        set_logger(effective_log_file)
+        LOGGER.info("Initialising PathRAG retriever")
+        pickle_path = None if project_paths is None else project_paths.retrieval_graph_pickle_file
+        self._storage = StorageAdapter(paths=effective_storage_paths, graph_pickle_path=pickle_path)
+        self._chat = RetrieveChat(system_prompt=system_prompt)
 
     # ------------------------------------------------------------------
     # Retrieval entry points
@@ -930,18 +909,74 @@ def build_local_windows(
             user_prompt=question,
         )
 
-        answer = await self._chat.generate(
-            prompt,
-            max_tokens=settings.retrieval.llm_max_tokens,
-            temperature=settings.retrieval.llm_temperature,
-        )
-        return RetrievalResult(
-            answer=answer,
-            context_windows=context_windows,
-            entity_matches=entity_matches,
-            relation_matches=relation_matches,
-            chunk_matches=chunk_matches,
-        )
+        answer = await self._chat.generate(
+            prompt,
+            max_tokens=settings.retrieval.llm_max_tokens,
+            temperature=settings.retrieval.llm_temperature,
+        )
+        result = RetrievalResult(
+            answer=answer,
+            context_windows=context_windows,
+            entity_matches=entity_matches,
+            relation_matches=relation_matches,
+            chunk_matches=chunk_matches,
+        )
+        if settings.logging.qa_enabled:
+            write_query_log(
+                project_paths=self._project_paths,
+                retriever_name="pathrag",
+                payload={
+                    "question": question,
+                    "answer": answer,
+                    "active_documents_root": str(self._project_paths.documents_root)
+                    if self._project_paths
+                    else None,
+                    "conversation_history": conversation_history or [],
+                    "retrieval_metadata": {
+                        "response_type": settings.retrieval.response_type,
+                        "entity_top_k": settings.retrieval.entity_top_k,
+                        "relation_top_k": settings.retrieval.relation_top_k,
+                        "chunk_top_k": settings.retrieval.chunk_top_k,
+                        "global_window_count": len(global_windows),
+                        "local_window_count": len(local_windows),
+                    },
+                    "model": _retrieval_model_metadata(),
+                    "context_windows": [
+                        {"label": window.label, "text": window.text, "score": window.score}
+                        for window in context_windows
+                    ],
+                    "retrieved_entities": [
+                        {
+                            "name": match.name,
+                            "type": match.type,
+                            "description": match.description,
+                            "score": match.score,
+                        }
+                        for match in entity_matches
+                    ],
+                    "retrieved_relationships": [
+                        {
+                            "source_name": match.source_name,
+                            "target_name": match.target_name,
+                            "description": match.description,
+                            "keywords": match.keywords,
+                            "score": match.score,
+                        }
+                        for match in relation_matches
+                    ],
+                    "retrieved_chunks": [
+                        {
+                            "chunk_uuid": match.chunk_uuid,
+                            "document_id": match.document_id,
+                            "filename": match.filename,
+                            "text": match.text,
+                            "score": match.score,
+                        }
+                        for match in chunk_matches
+                    ],
+                },
+            )
+        return result
 
     def retrieve(
         self,
@@ -969,4 +1004,4 @@ def retrieve(
     "ChunkMatch",
     "ContextWindow",
     "RetrievalResult",
-]
\ No newline at end of file
+]
diff --git a/graph/project_paths.py b/graph/project_paths.py
new file mode 100644
index 0000000..b051e0b
--- /dev/null
+++ b/graph/project_paths.py
@@ -0,0 +1,100 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable, List, Optional, Union
+
+from settings import VALID_EXTENSIONS, StoragePaths, settings
+
+
+@dataclass(frozen=True)
+class ProjectPaths:  # Immutable bundle of per-project locations, as populated by resolve_project_paths().
+    documents_root: Path  # resolved folder holding the source documents
+    project_root: Path  # documents_root / settings.project.artifacts_dirname
+    storage_root: Path  # project_root / settings.project.storage_dirname
+    knowledge_graph_dir: Path  # project_root / "knowledge_graph"
+    graph_pickle_file: Path  # knowledge_graph_dir / "kg.pkl"
+    retrieval_graph_pickle_file: Path  # knowledge_graph_dir / "kg_retrieval.pkl"
+    logs_dir: Path  # project_root / settings.project.logs_dirname
+    qa_logs_dir: Path  # logs_dir / settings.project.qa_logs_dirname
+    audits_dir: Path  # project_root / settings.project.audits_dirname
+    extraction_audits_dir: Path  # audits_dir / settings.project.extraction_audits_dirname
+    storage: StoragePaths  # *.sqlite and chroma_* locations under storage_root
+    ingestion_log_file: Path  # logs_dir / "ingestion.log"
+    pathrag_log_file: Path  # logs_dir / "pathrag.log"
+    lightrag_log_file: Path  # logs_dir / "lightrag.log"
+
+
+def resolve_project_paths(documents_root: Union[Path, str]) -> ProjectPaths:
+    """Derive every artifact location for the project rooted at ``documents_root``.
+
+    Paths are only computed here; directory names come from ``settings.project``.
+    """
+    cfg = settings.project
+    root = Path(documents_root).expanduser().resolve()
+    project_root = root / cfg.artifacts_dirname
+    storage_root = project_root / cfg.storage_dirname
+    knowledge_graph_dir = project_root / "knowledge_graph"
+    logs_dir = project_root / cfg.logs_dirname
+    audits_dir = project_root / cfg.audits_dirname
+    return ProjectPaths(
+        documents_root=root,
+        project_root=project_root,
+        storage_root=storage_root,
+        knowledge_graph_dir=knowledge_graph_dir,
+        graph_pickle_file=knowledge_graph_dir / "kg.pkl",
+        retrieval_graph_pickle_file=knowledge_graph_dir / "kg_retrieval.pkl",
+        logs_dir=logs_dir,
+        qa_logs_dir=logs_dir / cfg.qa_logs_dirname,
+        audits_dir=audits_dir,
+        extraction_audits_dir=audits_dir / cfg.extraction_audits_dirname,
+        storage=StoragePaths(
+            documents_db=str(storage_root / "documents.sqlite"),
+            chunks_db=str(storage_root / "chunks.sqlite"),
+            graph_db=str(storage_root / "graph.sqlite"),
+            chroma_chunks=str(storage_root / "chroma_chunks"),
+            chroma_entities=str(storage_root / "chroma_entities"),
+            chroma_relations=str(storage_root / "chroma_relations"),
+        ),
+        ingestion_log_file=logs_dir / "ingestion.log",
+        pathrag_log_file=logs_dir / "pathrag.log",
+        lightrag_log_file=logs_dir / "lightrag.log",
+    )
+
+
+def ensure_project_dirs(project_paths: ProjectPaths) -> None:
+    """Create every project artifact directory, including missing parents."""
+    directories = (
+        project_paths.project_root, project_paths.storage_root,
+        project_paths.knowledge_graph_dir,
+        project_paths.logs_dir, project_paths.qa_logs_dir,
+        project_paths.audits_dir, project_paths.extraction_audits_dir,
+    )
+    for directory in directories:
+        # exist_ok=True means an already-existing directory does not raise.
+        directory.mkdir(parents=True, exist_ok=True)
+
+
+def list_document_paths(
+    documents_root: Union[Path, str],
+    *,
+    valid_extensions: Optional[Iterable[str]] = None,
+) -> List[Path]:
+    """Recursively collect ingestible files under ``documents_root``.
+
+    Files inside the project's artifacts directory are ignored, extensions are
+    compared case-insensitively against ``valid_extensions`` (defaulting to
+    ``VALID_EXTENSIONS``), and the result is sorted by lower-cased path.
+    A missing or non-directory root yields an empty list.
+    """
+    root = Path(documents_root).expanduser().resolve()
+    if not (root.exists() and root.is_dir()):
+        return []
+    allowed = {ext.lower() for ext in (valid_extensions or VALID_EXTENSIONS)}
+    artifacts_root = root / settings.project.artifacts_dirname
+    candidates = [
+        entry
+        for entry in root.rglob("*")
+        if entry.is_file() and artifacts_root not in entry.parents and entry.suffix.lower() in allowed
+    ]
+    return sorted(candidates, key=lambda item: str(item).lower())
diff --git a/graph/prompts.py b/graph/prompts.py
index 1dc06d0..94c049b 100644
--- a/graph/prompts.py
+++ b/graph/prompts.py
@@ -211,6 +211,51 @@
 Answer ONLY by `YES` OR `NO` if there are still entities that need to be added.
 """.strip()
 
+PROMPTS["entity_extraction_audit"] = """---Role---
+
+You are auditing an information extraction result for completeness.
+
+---Goal---
+
+Review the source text and the initial extraction. Report possible missing entities or relationships, but do not rewrite or replace the original extraction.
+Use {language} for any natural-language explanations.
+
+---Instructions---
+
+- Focus only on entities of these types: [{entity_types}]
+- Review the initial extraction carefully before reporting gaps.
+- Only report candidates that appear to be missing from the original extraction.
+- If nothing important appears missing, return empty arrays.
+- Output valid JSON only.
+
+---Source Text---
+{input_text}
+
+---Initial Extraction---
+{initial_extraction}
+
+---Required JSON Schema---
+{{
+  "missing_entities": [
+    {{
+      "name": "entity name",
+      "type": "entity type",
+      "reason": "why this looks missing"
+    }}
+  ],
+  "missing_relationships": [
+    {{
+      "source_name": "source entity",
+      "target_name": "target entity",
+      "reason": "why this relationship looks missing"
+    }}
+  ],
+  "summary": "short audit summary"
+}}
+
+JSON:
+"""
+
 PROMPTS["fail_response"] = (
     "Sorry, I'm not able to provide an answer to that question.[no-context]"
 )
@@ -383,6 +428,9 @@
 
 Response:"""
 
+PROMPTS["lightrag_response"] = PROMPTS["rag_response"]
+PROMPTS["rag_response_naive"] = PROMPTS["naive_rag_response"]
+
 PROMPTS["summarize_text"] = """---Role---
 You are a helpful assistant responsible for generating a concise summary of the data provided below.
 Given a text document, please provide a concise summary that captures the main points and key information.
@@ -426,4 +474,4 @@
     hint_prompt = entity_extract_prompt.format(
             **{**context_base, "input_text": content}
         )
-    print(hint_prompt)
\ No newline at end of file
+    print(hint_prompt)
diff --git a/graph/query_logging.py b/graph/query_logging.py
new file mode 100644
index 0000000..d0975ad
--- /dev/null
+++ b/graph/query_logging.py
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, Optional
+from uuid import uuid4
+
+from project_paths import ProjectPaths, ensure_project_dirs
+
+
+def _json_safe(value: Any) -> Any:
+    """Recursively coerce ``value`` into JSON-friendly builtins; unknown types become ``str``."""
+    if isinstance(value, dict):
+        return {str(key): _json_safe(item) for key, item in value.items()}
+    if isinstance(value, (list, tuple, set)):
+        return [_json_safe(item) for item in value]
+    passthrough = value is None or isinstance(value, (str, int, float, bool))
+    return value if passthrough else str(value)
+
+
+def write_query_log(
+    *,
+    project_paths: Optional[ProjectPaths],
+    retriever_name: str,
+    payload: Dict[str, Any],
+) -> Optional[Path]:
+    """Persist one query/answer record as a JSON file under ``qa_logs_dir``.
+
+    Returns the written file, or ``None`` (writing nothing) without ``project_paths``.
+    """
+    if project_paths is None:
+        return None
+
+    ensure_project_dirs(project_paths)
+    now = datetime.now(timezone.utc)
+    stamp = now.strftime('%Y%m%dT%H%M%S.%fZ')
+    target = project_paths.qa_logs_dir / f"{stamp}_{retriever_name}_{uuid4().hex[:8]}.json"
+    # Payload keys are merged last, so they override "retriever"/"timestamp" on collision.
+    body = {"retriever": retriever_name, "timestamp": now.isoformat(), **_json_safe(payload)}
+    target.write_text(json.dumps(body, ensure_ascii=False, indent=2), encoding="utf-8")
+    return target
diff --git a/graph/settings.py b/graph/settings.py
index 4d8637a..aefb83c 100644
--- a/graph/settings.py
+++ b/graph/settings.py
@@ -1,330 +1,361 @@
-# graph/settings.py
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-from typing import List, Optional, Literal
-from dotenv import load_dotenv
-import utils as ut
-
-
-VALID_EXTENSIONS: List[str] = [".pdf", ".docx", ".txt", ".md", ".html"]
-
-# ─────────────────────────────────────────────────────────────
-# Settings sections
-# ─────────────────────────────────────────────────────────────
-
-@dataclass(frozen=True)
-class ProviderSettings:
-    """
-    Provider selection + credentials.
-    Choose provider via: LLM_PROVIDER = "openai" or "azure"
-    """
-    provider: Literal["openai", "azure"] = "openai"
-
-    # OpenAI (direct)
-    openai_api_key: Optional[str] = None
-    openai_base_url: Optional[str] = None         # optional (for proxies / compatible servers)
-    openai_llm_model: Optional[str] = None        # e.g. gpt-4o-mini
-    openai_embeddings_model: Optional[str] = None # e.g. text-embedding-3-small
-
-    # Azure OpenAI
-    azure_api_key: Optional[str] = None
-    azure_endpoint: Optional[str] = None
-    azure_api_version: str = "2024-02-15-preview"
-    azure_llm_deployment: Optional[str] = None
-    azure_embeddings_deployment: Optional[str] = None
-
-
-@dataclass(frozen=True)
-class ChatGenerationSettings:
-    """
-    Default knobs for chat completions used across the app.
-    """
-    temperature: float = 0.0
-    max_tokens: int = 2048
-
-@dataclass(frozen=True)
-class LLMPerformanceSettings:
-    """
-    Performance-related knobs for LLM calls.
-    """
-    max_concurrency: int = 6              # num of parallel requests
-    cache_enabled: bool = True
-    cache_max_age_hours: int = 720        # 30 days
-
-@dataclass(frozen=True)
-class EmbeddingSettings:
-    """
-    Embeddings model + batch behavior for vectorization.
-    (Model names come from ProviderSettings; this holds cross-cutting knobs.)
-    """
-    batch_size: int = 64
-
-@dataclass(frozen=True)
-class PromptFormattingSettings:
-    """
-    Formatting conventions your prompts expect & rely on.
-    Used by extractor prompts and downstream parsers.
-    """
-    default_language: str = "English"
-    tuple_delimiter: str = "<|>"
-    record_delimiter: str = "##"
-    completion_delimiter: str = "<|COMPLETE|>"
-    default_entity_types: List[str] = None  # filled in loader
-
-@dataclass(frozen=True)
-class IngestionMergeSettings:
-    """
-    How we concatenate multi-source fields before optional summarization.
-    delimiter: separates repeated descriptions/keywords/source_ids/filepaths
-    description_segment_limit: threshold after which we summarize with LLM
-    """
-    delimiter: str = "||"
-    description_segment_limit: int = 5
-
-@dataclass(frozen=True)
-class ChunkingSettings:
-    """
-    Default chunking policy for page-aware, sentence-preserving chunker.
-    """
-    max_chars: int = 1200
-    overlap_chars: int = 200
-    include_overlap_in_limit: bool = True
-    join_with: str = " "
-
-@dataclass(frozen=True)
-class StoragePaths:
-    """
-    Where we store SQLite DBs and Chroma collections.
-    """
-    documents_db: str = "./storage/documents.sqlite"
-    chunks_db: str = "./storage/chunks.sqlite"
-    graph_db: str = "./storage/graph.sqlite"
-
-    chroma_chunks: str = "./storage/chroma_chunks"
-    chroma_entities: str = "./storage/chroma_entities"
-    chroma_relations: str = "./storage/chroma_relations"
-
-@dataclass(frozen=True)
-class RetrievalSettings:
-    """
-    Settings for retrieval operations (e.g. how many results to return).
-    """
-    entity_top_k: int = 5
-    relation_top_k: int = 5
-    chunk_top_k: int = 6
-    graph_depth: int = 2
-    graph_windows: int = 3
-    chunk_windows: int = 3
-    graph_window_tokens: int = 512
-    chunk_window_tokens: int = 512
-    tiktoken_model: str = "gpt-4o-mini"  # for token counting
-    llm_max_tokens: int = 512
-    llm_temperature: float = 0.0
-    history_turns: int = 4
-
-    # --- Hybrid/global (i.e. do we use paths & relations?) ---
-    hybrid_use_paths_for_global: bool = True # whether to build global context from paths
-    hybrid_use_relations_for_global: bool = False   # whether to build global context from relations
-    global_max_windows: int = 4 # max global context windows
-    global_window_tokens: int = 512 # token cap per global window
-
-    # --- Local toggles (i.e. do we use chunks and local neighborhoods?) ---
-    use_local_chunks: bool = True # whether to use local chunks
-    use_local_graph: bool = True # whether to use local graph
-    local_max_windows: int = 6 # cap total local windows
-
-    # --- PathRAG-specific retrieval settings ---
-    path_use_top_entities: int = 5   # limit the number of entity seeds considered
-    path_max_depth: int = 3          # search up to 3 hops
-    path_threshold: float = 0.3      # propagation threshold
-    path_alpha: float = 0.8          # propagation decay
-    path_max_windows: int = 5        # how many path windows to emit
-    path_window_tokens: int = 512    # tokens per path window
-
-    # --- LightRAG-specific retrieval settings ---
-    light_mode: str = "mix"       # 'local', 'global', 'hybrid', 'mix', 'naive'
-    response_type: str = "Single Paragraph" #'Multiple Paragraphs', 'Single Paragraph'
-    rerank_top_k: int = 20
-    enable_rerank: bool = True
-    rerank_cache_dir: str = "./flashrank_model"  # directory for FlashRank model cache
-    rerank_model_name: str = "ms-marco-MultiBERT-L-12"  # model name for reranking
-    truncate_chunks: bool = False  # whether to truncate chunks by token limit
-
-
-@dataclass(frozen=True)
-class Settings:
-    """
-    Full application settings bundle.
-    """
-    provider: ProviderSettings
-    chat: ChatGenerationSettings
-    llmperf: LLMPerformanceSettings
-    embeddings: EmbeddingSettings
-    prompts: PromptFormattingSettings
-    ingestion: IngestionMergeSettings
-    chunking: ChunkingSettings
-    storage: StoragePaths
-    retrieval: RetrievalSettings
-
-
-
-# ─────────────────────────────────────────────────────────────
-# Loader / validator
-# ─────────────────────────────────────────────────────────────
-
-def load_settings() -> Settings:
-    """
-    Loads and validates all application settings from environment variables.
-
-    Reads from .env file, parses configuration for provider, LLM, embeddings, chunking,
-    storage paths, and retrieval settings. Validates required fields based on provider.
-
-    Returns:
-        Settings: A fully configured Settings object.
-
-    Raises:
-        RuntimeError: If required environment variables are missing or invalid.
-    """
-    load_dotenv()  # called once at startup
-
-    # Provider selection
-    provider_name = (ut.env_str("LLM_PROVIDER", "openai") or "openai").strip().lower()
-    if provider_name not in {"openai", "azure"}:
-        raise RuntimeError("LLM_PROVIDER must be 'openai' or 'azure'.")
-
-    provider = ProviderSettings(
-        provider=provider_name,  # type: ignore[arg-type]
-        # OpenAI
-        openai_api_key=ut.env_str("OPENAI_API_KEY"),
-        openai_base_url=ut.env_str("OPENAI_BASE_URL"),
-        openai_llm_model=ut.env_str("OPENAI_LLM_MODEL"),
-        openai_embeddings_model=ut.env_str("OPENAI_EMBEDDINGS_MODEL"),
-        # Azure
-        azure_api_key=ut.env_str("AZURE_OPENAI_API_KEY"),
-        azure_endpoint=ut.env_str("AZURE_OPENAI_ENDPOINT"),
-        azure_api_version=ut.env_str("AZURE_OPENAI_API_VERSION", "2024-02-15-preview") or "2024-02-15-preview",
-        azure_llm_deployment=ut.env_str("AZURE_OPENAI_LLM_DEPLOYMENT_NAME"),
-        azure_embeddings_deployment=ut.env_str("AZURE_OPENAI_EMB_DEPLOYMENT_NAME"),
-    )
-
-    # Validate provider-specific required fields
-    if provider.provider == "openai":
-        if not provider.openai_api_key:
-            raise RuntimeError("OPENAI_API_KEY is required when LLM_PROVIDER=openai.")
-        if not provider.openai_llm_model:
-            raise RuntimeError("OPENAI_LLM_MODEL is required when LLM_PROVIDER=openai.")
-        if not provider.openai_embeddings_model:
-            raise RuntimeError("OPENAI_EMBEDDINGS_MODEL is required when LLM_PROVIDER=openai.")
-    else:
-        # azure
-        missing = []
-        if not provider.azure_api_key: missing.append("AZURE_OPENAI_API_KEY")
-        if not provider.azure_endpoint: missing.append("AZURE_OPENAI_ENDPOINT")
-        if not provider.azure_llm_deployment: missing.append("AZURE_OPENAI_LLM_DEPLOYMENT_NAME")
-        if not provider.azure_embeddings_deployment: missing.append("AZURE_OPENAI_EMB_DEPLOYMENT_NAME")
-        if missing:
-            raise RuntimeError(f"When LLM_PROVIDER=azure, set required variables: {', '.join(missing)}")
-
-    chat = ChatGenerationSettings(
-        temperature=ut.env_float("CHAT_TEMPERATURE", 0.0),
-        max_tokens=ut.env_int("CHAT_MAX_TOKENS", 2048),
-    )
-
-    llmperf = LLMPerformanceSettings(
-        max_concurrency=ut.env_int("LLM_MAX_CONCURRENCY", 6),
-        cache_enabled=ut.env_bool("LLM_CACHE_ENABLED", True),
-        cache_max_age_hours=ut.env_int("LLM_CACHE_MAX_AGE_HOURS", 720),
-    )
-
-    embeddings = EmbeddingSettings(
-        batch_size=ut.env_int("EMBEDDING_BATCH_SIZE", 64),
-    )
-
-    prompts = PromptFormattingSettings(
-        default_language=ut.env_str("PROMPT_DEFAULT_LANGUAGE", "English") or "English",
-        tuple_delimiter=ut.env_str("PROMPT_TUPLE_DELIMITER", "<|>") or "<|>",
-        record_delimiter=ut.env_str("PROMPT_RECORD_DELIMITER", "##") or "##",
-        completion_delimiter=ut.env_str("PROMPT_COMPLETION_DELIMITER", "<|COMPLETE|>") or "<|COMPLETE|>",
-        default_entity_types=ut.env_list(
-            "PROMPT_DEFAULT_ENTITY_TYPES",
-            "organization,person,geo,event,category"
-        ),
-    )
-
-    ingestion = IngestionMergeSettings(
-        delimiter=ut.env_str("MERGE_DELIMITER", "||") or "||",
-        description_segment_limit=ut.env_int("DESCRIPTION_SEGMENT_LIMIT", 5),
-    )
-
-    chunking = ChunkingSettings(
-        max_chars=ut.env_int("CHUNK_MAX_CHARS", 1200),
-        overlap_chars=ut.env_int("CHUNK_OVERLAP_CHARS", 200),
-        include_overlap_in_limit=ut.env_bool("CHUNK_INCLUDE_OVERLAP_IN_LIMIT", True),
-        join_with=ut.env_str("CHUNK_JOIN_WITH", " ") or " ",
-    )
-
-    storage = StoragePaths(
-        documents_db=ut.env_str("DOCUMENTS_DB_PATH", "./storage/documents.sqlite") or "./storage/documents.sqlite",
-        chunks_db=ut.env_str("CHUNKS_DB_PATH", "./storage/chunks.sqlite") or "./storage/chunks.sqlite",
-        graph_db=ut.env_str("GRAPH_DB_PATH", "./storage/graph.sqlite") or "./storage/graph.sqlite",
-        chroma_chunks=ut.env_str("CHROMA_CHUNKS_PATH", "./storage/chroma_chunks") or "./storage/chroma_chunks",
-        chroma_entities=ut.env_str("CHROMA_ENTITIES_PATH", "./storage/chroma_entities") or "./storage/chroma_entities",
-        chroma_relations=ut.env_str("CHROMA_RELATIONS_PATH", "./storage/chroma_relations") or "./storage/chroma_relations",
-    )
-
-    retrieval = RetrievalSettings(
-        entity_top_k=ut.env_int("RETRIEVAL_ENTITY_TOP_K", 5),
-        relation_top_k=ut.env_int("RETRIEVAL_RELATION_TOP_K", 5),
-        chunk_top_k=ut.env_int("RETRIEVAL_CHUNK_TOP_K", 6),
-        graph_depth=ut.env_int("RETRIEVAL_GRAPH_DEPTH", 2),
-        graph_windows=ut.env_int("RETRIEVAL_GRAPH_WINDOWS", 3),
-        chunk_windows=ut.env_int("RETRIEVAL_CHUNK_WINDOWS", 3),
-        graph_window_tokens=ut.env_int("RETRIEVAL_GRAPH_WINDOW_TOKENS", 512),
-        chunk_window_tokens=ut.env_int("RETRIEVAL_CHUNK_WINDOW_TOKENS", 512),
-        tiktoken_model = ut.env_str("RETRIEVAL_TIKTOKEN_MODEL", "gpt-4o-mini") or "gpt-4o-mini",
-        llm_max_tokens = ut.env_int("RETRIEVAL_LLM_MAX_TOKENS", 512),
-        llm_temperature = ut.env_float("RETRIEVAL_LLM_TEMPERATURE", 0.0),
-        history_turns= ut.env_int("RETRIEVAL_HISTORY_TURNS", 4),
-        # Hybrid/global
-        hybrid_use_paths_for_global = ut.env_bool("RETRIEVAL_HYBRID_USE_PATHS_FOR_GLOBAL", True),
-        hybrid_use_relations_for_global = ut.env_bool("RETRIEVAL_HYBRID_USE_RELATIONS_FOR_GLOBAL", False),
-        global_max_windows = ut.env_int("RETRIEVAL_GLOBAL_MAX_WINDOWS", 4),
-        global_window_tokens = ut.env_int("RETRIEVAL_GLOBAL_WINDOW_TOKENS", 512),
-        # Local
-        use_local_chunks = ut.env_bool("RETRIEVAL_USE_LOCAL_CHUNKS", True),
-        use_local_graph = ut.env_bool("RETRIEVAL_USE_LOCAL_GRAPH", True),
-        local_max_windows = ut.env_int("RETRIEVAL_LOCAL_MAX_WINDOWS", 6),
-        # PathRAG-specific
-        path_use_top_entities = ut.env_int("RETRIEVAL_PATH_USE_TOP_ENTITIES", 5),
-        path_max_depth = ut.env_int("RETRIEVAL_PATH_MAX_DEPTH", 3),
-        path_threshold = ut.env_float("RETRIEVAL_PATH_THRESHOLD", 0.3),
-        path_alpha = ut.env_float("RETRIEVAL_PATH_ALPHA", 0.8),
-        path_max_windows = ut.env_int("RETRIEVAL_PATH_MAX_WINDOWS", 5),
-        path_window_tokens = ut.env_int("RETRIEVAL_PATH_WINDOW_TOKENS", 512),
-        # LightRAG-specific
-        light_mode = ut.env_str("RETRIEVAL_LIGHT_MODE", "mix"),
-        response_type = ut.env_str("RETRIEVAL_RESPONSE_TYPE", "Single Paragraphs"),
-        enable_rerank = ut.env_bool("RETRIEVAL_ENABLE_RERANK", True),
-        rerank_top_k = ut.env_int("RETRIEVAL_RERANK_TOP_K", 20),
-        rerank_cache_dir = ut.env_str("RETRIEVAL_RERANK_CACHE_DIR", "./flashrank_model"),
-        rerank_model_name = ut.env_str("RETRIEVAL_RERANK_MODEL_NAME", "ms-marco-MultiBERT-L-12"),
-        truncate_chunks = ut.env_bool("RETRIEVAL_TRUNCATE_CHUNKS", False),
-    )
-
-    return Settings(
-        provider=provider,
-        chat=chat,
-        llmperf=llmperf,
-        embeddings=embeddings,
-        prompts=prompts,
-        ingestion=ingestion,
-        chunking=chunking,
-        storage=storage,
-        retrieval=retrieval,
-    )
-
-
-# Optional: convenience singleton (you can prefer dependency injection instead)
-settings = load_settings()
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import List, Literal, Optional
+
+from dotenv import load_dotenv
+
+import utils as ut
+
+
+VALID_EXTENSIONS: List[str] = [".pdf", ".docx", ".txt", ".md", ".html"]
+
+
+@dataclass(frozen=True)
+class ProviderSettings:
+    provider: Literal["openai", "azure"] = "openai"
+    openai_api_key: Optional[str] = None
+    openai_base_url: Optional[str] = None
+    openai_llm_model: Optional[str] = None
+    openai_embeddings_model: Optional[str] = None
+    azure_api_key: Optional[str] = None
+    azure_endpoint: Optional[str] = None
+    azure_api_version: str = "2024-02-15-preview"
+    azure_llm_deployment: Optional[str] = None
+    azure_embeddings_deployment: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class ChatGenerationSettings:
+    temperature: float = 0.0
+    completion_max_tokens: int = 2048
+
+    @property
+    def max_tokens(self) -> int:
+        return self.completion_max_tokens
+
+
+@dataclass(frozen=True)
+class LLMPerformanceSettings:
+    max_concurrency: int = 6
+    cache_enabled: bool = True
+    cache_max_age_hours: int = 720
+
+
+@dataclass(frozen=True)
+class EmbeddingSettings:
+    batch_size: int = 64
+
+
+@dataclass(frozen=True)
+class PromptFormattingSettings:
+    default_language: str = "English"
+    tuple_delimiter: str = "<|>"
+    record_delimiter: str = "##"
+    completion_delimiter: str = "<|COMPLETE|>"
+    default_entity_types: List[str] = None  # type: ignore[assignment]
+    default_user_prompt: str = "n/a"
+
+
+@dataclass(frozen=True)
+class IngestionMergeSettings:
+    delimiter: str = "||"
+    description_segment_limit: int = 5
+
+
+@dataclass(frozen=True)
+class ChunkingSettings:
+    max_chars: int = 1200
+    overlap_chars: int = 200
+    include_overlap_in_limit: bool = True
+    join_with: str = " "
+
+
+@dataclass(frozen=True)
+class StoragePaths:
+    documents_db: str = "./storage/documents.sqlite"
+    chunks_db: str = "./storage/chunks.sqlite"
+    graph_db: str = "./storage/graph.sqlite"
+    chroma_chunks: str = "./storage/chroma_chunks"
+    chroma_entities: str = "./storage/chroma_entities"
+    chroma_relations: str = "./storage/chroma_relations"
+
+
+@dataclass(frozen=True)
+class ProjectSettings:
+    artifacts_dirname: str = ".appl-kgraph"
+    storage_dirname: str = "storage"
+    logs_dirname: str = "logs"
+    qa_logs_dirname: str = "qa"
+    audits_dirname: str = "audits"
+    extraction_audits_dirname: str = "extraction"
+
+
+@dataclass(frozen=True)
+class ExtractionSettings:
+    use_chunk_language: bool = True
+    detect_chunk_language: bool = False
+    audit_second_pass_enabled: bool = False
+
+
+@dataclass(frozen=True)
+class LoggingSettings:
+    ingestion_enabled: bool = True
+    ingestion_level: str = "INFO"
+    retrieval_enabled: bool = True
+    retrieval_level: str = "INFO"
+    qa_enabled: bool = True
+
+
+@dataclass(frozen=True)
+class RetrievalSettings:
+    entity_top_k: int = 5
+    relation_top_k: int = 5
+    chunk_top_k: int = 6
+    graph_depth: int = 2
+    graph_windows: int = 3
+    chunk_windows: int = 3
+    graph_window_tokens: int = 512
+    chunk_window_tokens: int = 512
+    tiktoken_model: str = "gpt-4o-mini"
+    answer_max_tokens: int = 512
+    llm_temperature: float = 0.0
+    history_turns: int = 4
+    hybrid_use_paths_for_global: bool = True
+    hybrid_use_relations_for_global: bool = False
+    global_max_windows: int = 4
+    global_window_tokens: int = 512
+    use_local_chunks: bool = True
+    use_local_graph: bool = True
+    local_max_windows: int = 6
+    path_use_top_entities: int = 5
+    path_max_depth: int = 3
+    path_threshold: float = 0.3
+    path_alpha: float = 0.8
+    path_max_windows: int = 5
+    path_window_tokens: int = 512
+    light_mode: str = "mix"
+    response_type: str = "Single Paragraph"
+    rerank_top_k: int = 20
+    enable_rerank: bool = True
+    rerank_cache_dir: str = "./flashrank_model"
+    rerank_model_name: str = "ms-marco-MultiBERT-L-12"
+    truncate_chunks: bool = False
+
+    @property
+    def llm_max_tokens(self) -> int:
+        return self.answer_max_tokens
+
+
+@dataclass(frozen=True)
+class Settings:
+    provider: ProviderSettings
+    chat: ChatGenerationSettings
+    llmperf: LLMPerformanceSettings
+    embeddings: EmbeddingSettings
+    prompts: PromptFormattingSettings
+    ingestion: IngestionMergeSettings
+    chunking: ChunkingSettings
+    storage: StoragePaths
+    project: ProjectSettings
+    extraction: ExtractionSettings
+    logging: LoggingSettings
+    retrieval: RetrievalSettings
+
+
+def load_settings() -> Settings:
+    load_dotenv()
+
+    provider_name = (ut.env_str("LLM_PROVIDER", "openai") or "openai").strip().lower()
+    if provider_name not in {"openai", "azure"}:
+        raise RuntimeError("LLM_PROVIDER must be 'openai' or 'azure'.")
+
+    provider = ProviderSettings(
+        provider=provider_name,  # type: ignore[arg-type]
+        openai_api_key=ut.env_str("OPENAI_API_KEY"),
+        openai_base_url=ut.env_str("OPENAI_BASE_URL"),
+        openai_llm_model=ut.env_str("OPENAI_LLM_MODEL"),
+        openai_embeddings_model=ut.env_str("OPENAI_EMBEDDINGS_MODEL"),
+        azure_api_key=ut.env_str("AZURE_OPENAI_API_KEY"),
+        azure_endpoint=ut.env_str("AZURE_OPENAI_ENDPOINT"),
+        azure_api_version=ut.env_str("AZURE_OPENAI_API_VERSION", "2024-02-15-preview")
+        or "2024-02-15-preview",
+        azure_llm_deployment=ut.env_str("AZURE_OPENAI_LLM_DEPLOYMENT_NAME"),
+        azure_embeddings_deployment=ut.env_str("AZURE_OPENAI_EMB_DEPLOYMENT_NAME"),
+    )
+
+    if provider.provider == "openai":
+        if not provider.openai_api_key:
+            raise RuntimeError("OPENAI_API_KEY is required when LLM_PROVIDER=openai.")
+        if not provider.openai_llm_model:
+            raise RuntimeError("OPENAI_LLM_MODEL is required when LLM_PROVIDER=openai.")
+        if not provider.openai_embeddings_model:
+            raise RuntimeError("OPENAI_EMBEDDINGS_MODEL is required when LLM_PROVIDER=openai.")
+    else:
+        missing = []
+        if not provider.azure_api_key:
+            missing.append("AZURE_OPENAI_API_KEY")
+        if not provider.azure_endpoint:
+            missing.append("AZURE_OPENAI_ENDPOINT")
+        if not provider.azure_llm_deployment:
+            missing.append("AZURE_OPENAI_LLM_DEPLOYMENT_NAME")
+        if not provider.azure_embeddings_deployment:
+            missing.append("AZURE_OPENAI_EMB_DEPLOYMENT_NAME")
+        if missing:
+            raise RuntimeError(
+                f"When LLM_PROVIDER=azure, set required variables: {', '.join(missing)}"
+            )
+
+    chat = ChatGenerationSettings(
+        temperature=ut.env_float("CHAT_TEMPERATURE", 0.0),
+        completion_max_tokens=ut.env_int("CHAT_MAX_TOKENS", 2048),
+    )
+
+    llmperf = LLMPerformanceSettings(
+        max_concurrency=ut.env_int("LLM_MAX_CONCURRENCY", 6),
+        cache_enabled=ut.env_bool("LLM_CACHE_ENABLED", True),
+        cache_max_age_hours=ut.env_int("LLM_CACHE_MAX_AGE_HOURS", 720),
+    )
+
+    embeddings = EmbeddingSettings(
+        batch_size=ut.env_int("EMBEDDING_BATCH_SIZE", 64),
+    )
+
+    prompts = PromptFormattingSettings(
+        default_language=ut.env_str("PROMPT_DEFAULT_LANGUAGE", "English") or "English",
+        tuple_delimiter=ut.env_str("PROMPT_TUPLE_DELIMITER", "<|>") or "<|>",
+        record_delimiter=ut.env_str("PROMPT_RECORD_DELIMITER", "##") or "##",
+        completion_delimiter=ut.env_str("PROMPT_COMPLETION_DELIMITER", "<|COMPLETE|>")
+        or "<|COMPLETE|>",
+        default_entity_types=ut.env_list(
+            "PROMPT_DEFAULT_ENTITY_TYPES",
+            "organization,person,geo,event,category",
+        ),
+        default_user_prompt=ut.env_str("PROMPT_DEFAULT_USER_PROMPT", "n/a") or "n/a",
+    )
+
+    ingestion = IngestionMergeSettings(
+        delimiter=ut.env_str("MERGE_DELIMITER", "||") or "||",
+        description_segment_limit=ut.env_int("DESCRIPTION_SEGMENT_LIMIT", 5),
+    )
+
+    chunking = ChunkingSettings(
+        max_chars=ut.env_int("CHUNK_MAX_CHARS", 1200),
+        overlap_chars=ut.env_int("CHUNK_OVERLAP_CHARS", 200),
+        include_overlap_in_limit=ut.env_bool("CHUNK_INCLUDE_OVERLAP_IN_LIMIT", True),
+        join_with=ut.env_str("CHUNK_JOIN_WITH", " ") or " ",
+    )
+
+    storage = StoragePaths(
+        documents_db=ut.env_str("DOCUMENTS_DB_PATH", "./storage/documents.sqlite")
+        or "./storage/documents.sqlite",
+        chunks_db=ut.env_str("CHUNKS_DB_PATH", "./storage/chunks.sqlite")
+        or "./storage/chunks.sqlite",
+        graph_db=ut.env_str("GRAPH_DB_PATH", "./storage/graph.sqlite")
+        or "./storage/graph.sqlite",
+        chroma_chunks=ut.env_str("CHROMA_CHUNKS_PATH", "./storage/chroma_chunks")
+        or "./storage/chroma_chunks",
+        chroma_entities=ut.env_str("CHROMA_ENTITIES_PATH", "./storage/chroma_entities")
+        or "./storage/chroma_entities",
+        chroma_relations=ut.env_str("CHROMA_RELATIONS_PATH", "./storage/chroma_relations")
+        or "./storage/chroma_relations",
+    )
+
+    project = ProjectSettings(
+        artifacts_dirname=ut.env_str("PROJECT_ARTIFACTS_DIRNAME", ".appl-kgraph")
+        or ".appl-kgraph",
+        storage_dirname=ut.env_str("PROJECT_STORAGE_DIRNAME", "storage") or "storage",
+        logs_dirname=ut.env_str("PROJECT_LOGS_DIRNAME", "logs") or "logs",
+        qa_logs_dirname=ut.env_str("PROJECT_QA_LOGS_DIRNAME", "qa") or "qa",
+        audits_dirname=ut.env_str("PROJECT_AUDITS_DIRNAME", "audits") or "audits",
+        extraction_audits_dirname=ut.env_str(
+            "PROJECT_EXTRACTION_AUDITS_DIRNAME", "extraction"
+        )
+        or "extraction",
+    )
+
+    extraction = ExtractionSettings(
+        use_chunk_language=ut.env_bool("EXTRACTION_USE_CHUNK_LANGUAGE", True),
+        detect_chunk_language=ut.env_bool("EXTRACTION_DETECT_CHUNK_LANGUAGE", False),
+        audit_second_pass_enabled=ut.env_bool(
+            "EXTRACTION_AUDIT_SECOND_PASS_ENABLED",
+            False,
+        ),
+    )
+
+    logging_settings = LoggingSettings(
+        ingestion_enabled=ut.env_bool("INGESTION_LOG_ENABLED", True),
+        ingestion_level=ut.env_str("INGESTION_LOG_LEVEL", "INFO") or "INFO",
+        retrieval_enabled=ut.env_bool("RETRIEVAL_LOG_ENABLED", True),
+        retrieval_level=ut.env_str("RETRIEVAL_LOG_LEVEL", "INFO") or "INFO",
+        qa_enabled=ut.env_bool("QA_LOG_ENABLED", True),
+    )
+
+    retrieval = RetrievalSettings(
+        entity_top_k=ut.env_int("RETRIEVAL_ENTITY_TOP_K", 5),
+        relation_top_k=ut.env_int("RETRIEVAL_RELATION_TOP_K", 5),
+        chunk_top_k=ut.env_int("RETRIEVAL_CHUNK_TOP_K", 6),
+        graph_depth=ut.env_int("RETRIEVAL_GRAPH_DEPTH", 2),
+        graph_windows=ut.env_int("RETRIEVAL_GRAPH_WINDOWS", 3),
+        chunk_windows=ut.env_int("RETRIEVAL_CHUNK_WINDOWS", 3),
+        graph_window_tokens=ut.env_int("RETRIEVAL_GRAPH_WINDOW_TOKENS", 512),
+        chunk_window_tokens=ut.env_int("RETRIEVAL_CHUNK_WINDOW_TOKENS", 512),
+        tiktoken_model=ut.env_str("RETRIEVAL_TIKTOKEN_MODEL", "gpt-4o-mini")
+        or "gpt-4o-mini",
+        answer_max_tokens=ut.env_int("RETRIEVAL_LLM_MAX_TOKENS", 512),
+        llm_temperature=ut.env_float("RETRIEVAL_LLM_TEMPERATURE", 0.0),
+        history_turns=ut.env_int("RETRIEVAL_HISTORY_TURNS", 4),
+        hybrid_use_paths_for_global=ut.env_bool(
+            "RETRIEVAL_HYBRID_USE_PATHS_FOR_GLOBAL",
+            True,
+        ),
+        hybrid_use_relations_for_global=ut.env_bool(
+            "RETRIEVAL_HYBRID_USE_RELATIONS_FOR_GLOBAL",
+            False,
+        ),
+        global_max_windows=ut.env_int("RETRIEVAL_GLOBAL_MAX_WINDOWS", 4),
+        global_window_tokens=ut.env_int("RETRIEVAL_GLOBAL_WINDOW_TOKENS", 512),
+        use_local_chunks=ut.env_bool("RETRIEVAL_USE_LOCAL_CHUNKS", True),
+        use_local_graph=ut.env_bool("RETRIEVAL_USE_LOCAL_GRAPH", True),
+        local_max_windows=ut.env_int("RETRIEVAL_LOCAL_MAX_WINDOWS", 6),
+        path_use_top_entities=ut.env_int("RETRIEVAL_PATH_USE_TOP_ENTITIES", 5),
+        path_max_depth=ut.env_int("RETRIEVAL_PATH_MAX_DEPTH", 3),
+        path_threshold=ut.env_float("RETRIEVAL_PATH_THRESHOLD", 0.3),
+        path_alpha=ut.env_float("RETRIEVAL_PATH_ALPHA", 0.8),
+        path_max_windows=ut.env_int("RETRIEVAL_PATH_MAX_WINDOWS", 5),
+        path_window_tokens=ut.env_int("RETRIEVAL_PATH_WINDOW_TOKENS", 512),
+        light_mode=ut.env_str("RETRIEVAL_LIGHT_MODE", "mix") or "mix",
+        response_type=ut.env_str("RETRIEVAL_RESPONSE_TYPE", "Single Paragraph")
+        or "Single Paragraph",
+        enable_rerank=ut.env_bool("RETRIEVAL_ENABLE_RERANK", True),
+        rerank_top_k=ut.env_int("RETRIEVAL_RERANK_TOP_K", 20),
+        rerank_cache_dir=ut.env_str(
+            "RETRIEVAL_RERANK_CACHE_DIR",
+            "./flashrank_model",
+        )
+        or "./flashrank_model",
+        rerank_model_name=ut.env_str(
+            "RETRIEVAL_RERANK_MODEL_NAME",
+            "ms-marco-MultiBERT-L-12",
+        )
+        or "ms-marco-MultiBERT-L-12",
+        truncate_chunks=ut.env_bool("RETRIEVAL_TRUNCATE_CHUNKS", False),
+    )
+
+    return Settings(
+        provider=provider,
+        chat=chat,
+        llmperf=llmperf,
+        embeddings=embeddings,
+        prompts=prompts,
+        ingestion=ingestion,
+        chunking=chunking,
+        storage=storage,
+        project=project,
+        extraction=extraction,
+        logging=logging_settings,
+        retrieval=retrieval,
+    )
+
+
+settings = load_settings()
diff --git a/graph/utils.py b/graph/utils.py
index 0e519a1..5810969 100644
--- a/graph/utils.py
+++ b/graph/utils.py
@@ -1,80 +1,81 @@
+from __future__ import annotations
+
 import os
-from typing import List, Optional, Literal
-from langdetect import detect, LangDetectException
+from typing import List, Optional
 
-def detect_language(text: str, num_chars: int = 1000) -> str:
-    """
-    Detects the language of a text based on a sample of its characters.
+from langdetect import LangDetectException, detect
 
-    Args:
-        text (str): The input text to analyze for language detection.
-        num_chars (int, optional): The number of characters from the beginning
-            of the text to use for detection. Defaults to 1000.
 
-    Returns:
-        str: A language code (e.g., 'en' for English, 'fr' for French) or 'unknown'
-            if the language cannot be detected or if the text is empty.
+LANGUAGE_NAME_MAP = {
+    "ar": "Arabic",
+    "de": "German",
+    "en": "English",
+    "es": "Spanish",
+    "fr": "French",
+    "it": "Italian",
+    "nl": "Dutch",
+    "pt": "Portuguese",
+    "zh": "Chinese",
+    "zh-cn": "Chinese",
+    "zh-tw": "Chinese Traditional",
+}
+
+
+def detect_language(text: str, num_chars: int = 1000) -> str:
+    """
+    Detect a language code from the leading portion of a text.
     """
     text_snippet = text[:num_chars] if len(text) > num_chars else text
 
     if not text_snippet.strip():
-        # Handle the case where the text snippet is empty or only contains whitespace
-        return 'unknown'
+        return "unknown"
+
     try:
         return detect(text_snippet)
-    except LangDetectException as e:
-        if 'No features in text' in str(e):
-            # Handle the specific error where no features are found in the text
-            return 'unknown'
-    # Default return statement to ensure the function always returns a value
-    return 'unknown'
+    except LangDetectException as exc:
+        if "No features in text" in str(exc):
+            return "unknown"
+    return "unknown"
 
-# ─────────────────────────────────────────────────────────────
-# Small helpers to parse environment variables robustly
-# ─────────────────────────────────────────────────────────────
 
-def _strip_quotes(val: Optional[str]) -> Optional[str]:
+def normalize_language_name(language: Optional[str], default: str = "English") -> str:
+    """
+    Convert a language code or free-form language string into a prompt-friendly name.
     """
-    Removes surrounding quotes from environment variable values.
+    if not language:
+        return default
 
-    Args:
-        val (Optional[str]): The value to process.
+    candidate = str(language).strip()
+    if not candidate:
+        return default
 
-    Returns:
-        Optional[str]: The value with quotes stripped, or None if input was None.
-    """
+    lowered = candidate.lower()
+    if lowered == "unknown":
+        return default
+    if lowered in LANGUAGE_NAME_MAP:
+        return LANGUAGE_NAME_MAP[lowered]
+    if len(candidate) <= 3 and candidate.islower():
+        return default
+    return candidate[:1].upper() + candidate[1:]
+
+
+def _strip_quotes(val: Optional[str]) -> Optional[str]:
     if val is None:
         return None
-    v = val.strip()
-    if (v.startswith('"') and v.endswith('"')) or (v.startswith("'") and v.endswith("'")):
-        return v[1:-1]
-    return v
-
-def env_str(key: str, default: Optional[str] = None) -> Optional[str]:
-    """
-    Reads a string environment variable with quote stripping.
+    stripped = val.strip()
+    if (stripped.startswith('"') and stripped.endswith('"')) or (
+        stripped.startswith("'") and stripped.endswith("'")
+    ):
+        return stripped[1:-1]
+    return stripped
 
-    Args:
-        key (str): The environment variable name.
-        default (Optional[str], optional): Default value if not found. Defaults to None.
 
-    Returns:
-        Optional[str]: The environment variable value or default.
-    """
+def env_str(key: str, default: Optional[str] = None) -> Optional[str]:
     val = os.getenv(key)
     return _strip_quotes(val) if val is not None else default
 
-def env_int(key: str, default: int) -> int:
-    """
-    Reads an integer environment variable with fallback to default.
 
-    Args:
-        key (str): The environment variable name.
-        default (int): Default value if not found or invalid.
-
-    Returns:
-        int: The parsed integer value or default.
-    """
+def env_int(key: str, default: int) -> int:
     val = env_str(key)
     if val is None or val == "":
         return default
diff --git a/test/test_chunker.py b/test/test_chunker.py
new file mode 100644
index 0000000..c4601fe
--- /dev/null
+++ b/test/test_chunker.py
@@ -0,0 +1,43 @@
+import os
+import sys
+from pathlib import Path
+
+
+os.environ.setdefault("OPENAI_API_KEY", "test-key")  # placeholder; an already-set value is kept
+os.environ.setdefault("OPENAI_LLM_MODEL", "test-model")
+os.environ.setdefault("OPENAI_EMBEDDINGS_MODEL", "test-embed")
+# Add <parent of this test dir>/graph to sys.path before the graph.chunker import below runs.
+sys.path.append(str(Path(__file__).resolve().parent.parent / "graph"))
+
+from graph.chunker import chunk_text
+
+
+def test_chunk_overlap_does_not_cascade_previous_overlap():
+    """Pin the exact chunk texts produced for six short sentences with a 7-char overlap."""
+    expected = ["A1. B2. C3.", "B2. C3. D4.", "D4. E5. F6."]
+
+    pieces = chunk_text(
+        "A1. B2. C3. D4. E5. F6.",
+        max_chars=11,
+        overlap_chars=7,
+        include_overlap_in_limit=True,
+    )
+
+    # Only the text of each chunk is compared, in order.
+    observed = [piece["text"] for piece in pieces]
+    assert observed == expected
+
+
+def test_chunker_keeps_oversized_sentence_intact():
+    """A single sentence longer than max_chars must come back as one flagged chunk."""
+    limit = 10
+    sentence = "This sentence is deliberately much longer than the configured chunk size."
+
+    pieces = chunk_text(sentence, max_chars=limit, overlap_chars=0, include_overlap_in_limit=True)
+
+    assert len(pieces) == 1
+    only = pieces[0]
+    # The text is untouched even though it exceeds the limit, and the chunk says so.
+    assert only["text"] == sentence
+    assert only["char_count"] > limit
+    assert only["exceeds_target"] is True
diff --git a/test/test_graph_pickle.py b/test/test_graph_pickle.py
new file mode 100644
index 0000000..97b1901
--- /dev/null
+++ b/test/test_graph_pickle.py
@@ -0,0 +1,114 @@
+import os
+import sys
+from pathlib import Path
+
+import networkx as nx
+
+
+os.environ.setdefault("OPENAI_API_KEY", "test-key")  # placeholder; an already-set value is kept
+os.environ.setdefault("OPENAI_LLM_MODEL", "test-model")
+os.environ.setdefault("OPENAI_EMBEDDINGS_MODEL", "test-embed")
+# Add <parent of this test dir>/graph to sys.path before the graph.graph_pickle import below runs.
+sys.path.append(str(Path(__file__).resolve().parent.parent / "graph"))
+
+from graph.graph_pickle import (
+    load_graph_from_pickle,
+    load_or_build_graph_snapshot,
+    save_graph_to_pickle,
+)
+
+
+class _FakeCursor:  # cursor stand-in that supports only fetchall()
+    def __init__(self, rows):
+        self._rows = rows  # handed back unchanged by fetchall()
+
+    def fetchall(self):
+        return self._rows
+
+
+class _FakeConnection:
+    """Connection stand-in: execute() picks canned rows by the table named in the SQL."""
+
+    def __init__(self, node_rows, edge_rows):
+        self._canned = (("FROM nodes", node_rows), ("FROM edges", edge_rows))
+
+    def execute(self, sql):
+        for marker, rows in self._canned:
+            if marker in sql:
+                return _FakeCursor(rows)
+        raise AssertionError(f"Unexpected query: {sql}")
+
+
+class _FakeGraphDB:  # graph DB stand-in; _FakeStorage exposes it as .graphdb
+    def __init__(self, node_rows, edge_rows):
+        self._rows = (node_rows, edge_rows)
+
+    def connect(self):
+        fake = _FakeConnection(*self._rows)
+
+        # Minimal context manager: entering yields the fake connection, exit returns False.
+        class _Scope:
+            def __enter__(scope):
+                return fake
+
+            def __exit__(scope, exc_type, exc, tb):
+                return False
+
+        return _Scope()
+
+
+class _FakeStorage:  # storage stand-in exposing only the .graphdb attribute
+    def __init__(self, node_rows, edge_rows):
+        self.graphdb = _FakeGraphDB(node_rows, edge_rows)
+
+
+def test_graph_pickle_round_trip(tmp_path):
+    """A graph written by save_graph_to_pickle loads back with its nodes and edge weight."""
+    original = nx.Graph()
+    original.add_node("A", type="person")
+    original.add_edge("A", "B", weight=2.0)
+    pickle_path = tmp_path / "graph.pkl"
+    save_graph_to_pickle(original, pickle_path)
+    restored = load_graph_from_pickle(pickle_path)
+
+    assert isinstance(restored, nx.Graph)
+    assert sorted(restored.nodes()) == ["A", "B"]
+    assert restored["A"]["B"]["weight"] == 2.0
+
+
+def test_load_or_build_graph_snapshot_rebuilds_and_saves_when_missing(tmp_path):
+    """With no snapshot on disk, the graph is built from storage rows and the file is written."""
+    node_rows = [
+        ("Node A", "person", "Alpha", "chunk-1||chunk-2", "doc-a.txt"),
+        ("Node B", "organization", "Beta", "", "doc-b.txt"),
+    ]
+    edge_rows = [
+        ("Node A", "Node B", 1.5, "works with", "partnership", "chunk-1", "doc-a.txt"),
+    ]
+    snapshot = tmp_path / "kg_retrieval.pkl"
+
+    built = load_or_build_graph_snapshot(_FakeStorage(node_rows, edge_rows), snapshot_path=snapshot)
+
+    assert snapshot.exists()
+    assert sorted(built.nodes()) == ["Node A", "Node B"]
+    # The "||"-joined chunk field comes back split into a list.
+    assert built.nodes["Node A"]["chunk_uuids"] == ["chunk-1", "chunk-2"]
+    assert built["Node A"]["Node B"]["keywords"] == "partnership"
+
+
+def test_load_or_build_graph_snapshot_prefers_existing_pickle(tmp_path):
+    """An existing snapshot file is loaded without calling connect() on storage.graphdb."""
+    snapshot = tmp_path / "kg_retrieval.pkl"
+    saved = nx.Graph()
+    saved.add_node("Saved Node", type="event")
+    save_graph_to_pickle(saved, snapshot)
+
+    class _ExplodingGraphDB:
+        def connect(self):
+            raise AssertionError("Storage should not be consulted when snapshot exists")
+
+    class _BrokenStorage:
+        graphdb = _ExplodingGraphDB()
+
+    restored = load_or_build_graph_snapshot(_BrokenStorage(), snapshot_path=snapshot)
+    assert sorted(restored.nodes()) == ["Saved Node"]
diff --git a/test/test_pathrag_storage_adapter.py b/test/test_pathrag_storage_adapter.py
index 365b116..aae2697 100644
--- a/test/test_pathrag_storage_adapter.py
+++ b/test/test_pathrag_storage_adapter.py
@@ -1,41 +1,15 @@
+import os
 import sys
-import types
+from pathlib import Path
 
 import pytest
 
 
-_storage_module = types.ModuleType("storage")
+os.environ.setdefault("OPENAI_API_KEY", "test-key")  # placeholder; an already-set value is kept
+os.environ.setdefault("OPENAI_LLM_MODEL", "test-model")
+os.environ.setdefault("OPENAI_EMBEDDINGS_MODEL", "test-embed")
 
-
-class _ImportStubStorage:
-    def __init__(self, *args, **kwargs):
-        pass
-
-    def init(self):  # pragma: no cover - simple stub
-        return None
-
-
-class _ImportStubPaths:
-    pass
-
-
-_storage_module.Storage = _ImportStubStorage
-_storage_module.StoragePaths = _ImportStubPaths
-sys.modules.setdefault("storage", _storage_module)
-
-_llm_module = types.ModuleType("llm")
-
-
-class _ImportStubChat:
-    def __init__(self, *args, **kwargs):
-        pass
-
-    def generate(self, *args, **kwargs):  # pragma: no cover - simple stub
-        return ""
-
-
-_llm_module.Chat = _ImportStubChat
-sys.modules.setdefault("llm", _llm_module)
+sys.path.append(str(Path(__file__).resolve().parent.parent / "graph"))
 
 from graph.pathrag import StorageAdapter
 
@@ -54,7 +28,7 @@ def __init__(self, responses):
         self.relation_vectors = _FakeVector(responses["relations"])
         self.chunk_vectors = _FakeVector(responses["chunks"])
 
-    def init(self):  # pragma: no cover - simple stub
+    def init(self):
         return None
 
 
@@ -103,7 +77,7 @@ def fake_storage(monkeypatch):
         ],
     }
     storage = _FakeStorage(responses)
-    monkeypatch.setattr("graph.pathrag.IngestionStorage", lambda *args, **kwargs: storage)
+    monkeypatch.setattr("graph.pathrag.Storage", lambda *args, **kwargs: storage)
     return storage
 
 
@@ -111,14 +85,14 @@ def test_storage_adapter_query_helpers_expand_matches(fake_storage):
     adapter = StorageAdapter()
 
     entity_matches = adapter.query_entities("query", limit=5)
-    assert [m.name for m in entity_matches] == ["Entity One", "Entity Two"]
-    assert all(m.score > 0 for m in entity_matches)
+    assert [match.name for match in entity_matches] == ["Entity One", "Entity Two"]
+    assert all(match.score > 0 for match in entity_matches)
 
     relation_matches = adapter.query_relations("query", limit=5)
-    assert [m.source_name for m in relation_matches] == ["Entity One", "Entity Two"]
-    assert all(m.score > 0 for m in relation_matches)
+    assert [match.source_name for match in relation_matches] == ["Entity One", "Entity Two"]
+    assert all(match.score > 0 for match in relation_matches)
 
     chunk_matches = adapter.query_chunks("query", limit=5)
-    assert [m.chunk_uuid for m in chunk_matches] == ["chunk-1", "chunk-2"]
-    assert [m.text for m in chunk_matches] == ["text one", "text two"]
-    assert all(m.score > 0 for m in chunk_matches)
+    assert [match.chunk_uuid for match in chunk_matches] == ["chunk-1", "chunk-2"]
+    assert [match.text for match in chunk_matches] == ["text one", "text two"]
+    assert all(match.score > 0 for match in chunk_matches)
diff --git a/test/test_project_paths.py b/test/test_project_paths.py
new file mode 100644
index 0000000..447830b
--- /dev/null
+++ b/test/test_project_paths.py
@@ -0,0 +1,41 @@
+import os
+import sys
+from pathlib import Path
+
+
+os.environ.setdefault("OPENAI_API_KEY", "test-key")  # placeholder; an already-set value is kept
+os.environ.setdefault("OPENAI_LLM_MODEL", "test-model")
+os.environ.setdefault("OPENAI_EMBEDDINGS_MODEL", "test-embed")
+# Add <parent of this test dir>/graph to sys.path before the graph.project_paths import below runs.
+sys.path.append(str(Path(__file__).resolve().parent.parent / "graph"))
+
+from graph.project_paths import list_document_paths, resolve_project_paths
+
+
+def test_resolve_project_paths_nests_artifacts_under_documents_root(tmp_path):
+    """Check the path layout resolve_project_paths returns for a fresh documents directory."""
+    docs_dir = tmp_path / "docs"
+    docs_dir.mkdir()
+    layout = resolve_project_paths(docs_dir)
+
+    assert layout.documents_root == docs_dir.resolve()
+    assert layout.project_root == docs_dir.resolve() / ".appl-kgraph"
+    assert Path(layout.storage.documents_db).parent == layout.storage_root
+    assert layout.qa_logs_dir.parent == layout.logs_dir
+    assert layout.graph_pickle_file == layout.knowledge_graph_dir / "kg.pkl"
+    assert layout.retrieval_graph_pickle_file == layout.knowledge_graph_dir / "kg_retrieval.pkl"
+
+
+def test_list_document_paths_excludes_project_artifacts(tmp_path):
+    """Files under the .appl-kgraph folder must not show up in the document listing."""
+    docs_dir = tmp_path / "docs"
+    artifacts_dir = docs_dir / ".appl-kgraph"
+    docs_dir.mkdir()
+    for name, body in (("a.txt", "alpha"), ("b.md", "beta")):
+        (docs_dir / name).write_text(body, encoding="utf-8")
+    artifacts_dir.mkdir()
+    (artifacts_dir / "ignored.txt").write_text("ignore me", encoding="utf-8")
+
+    listed = list_document_paths(docs_dir)
+
+    assert [entry.name for entry in listed] == ["a.txt", "b.md"]