diff --git a/.venv2/Include/site/python3.12/greenlet/greenlet.h b/.venv2/Include/site/python3.12/greenlet/greenlet.h new file mode 100644 index 0000000..d02a16e --- /dev/null +++ b/.venv2/Include/site/python3.12/greenlet/greenlet.h @@ -0,0 +1,164 @@ +/* -*- indent-tabs-mode: nil; tab-width: 4; -*- */ + +/* Greenlet object interface */ + +#ifndef Py_GREENLETOBJECT_H +#define Py_GREENLETOBJECT_H + + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* This is deprecated and undocumented. It does not change. */ +#define GREENLET_VERSION "1.0.0" + +#ifndef GREENLET_MODULE +#define implementation_ptr_t void* +#endif + +typedef struct _greenlet { + PyObject_HEAD + PyObject* weakreflist; + PyObject* dict; + implementation_ptr_t pimpl; +} PyGreenlet; + +#define PyGreenlet_Check(op) (op && PyObject_TypeCheck(op, &PyGreenlet_Type)) + + +/* C API functions */ + +/* Total number of symbols that are exported */ +#define PyGreenlet_API_pointers 12 + +#define PyGreenlet_Type_NUM 0 +#define PyExc_GreenletError_NUM 1 +#define PyExc_GreenletExit_NUM 2 + +#define PyGreenlet_New_NUM 3 +#define PyGreenlet_GetCurrent_NUM 4 +#define PyGreenlet_Throw_NUM 5 +#define PyGreenlet_Switch_NUM 6 +#define PyGreenlet_SetParent_NUM 7 + +#define PyGreenlet_MAIN_NUM 8 +#define PyGreenlet_STARTED_NUM 9 +#define PyGreenlet_ACTIVE_NUM 10 +#define PyGreenlet_GET_PARENT_NUM 11 + +#ifndef GREENLET_MODULE +/* This section is used by modules that uses the greenlet C API */ +static void** _PyGreenlet_API = NULL; + +# define PyGreenlet_Type \ + (*(PyTypeObject*)_PyGreenlet_API[PyGreenlet_Type_NUM]) + +# define PyExc_GreenletError \ + ((PyObject*)_PyGreenlet_API[PyExc_GreenletError_NUM]) + +# define PyExc_GreenletExit \ + ((PyObject*)_PyGreenlet_API[PyExc_GreenletExit_NUM]) + +/* + * PyGreenlet_New(PyObject *args) + * + * greenlet.greenlet(run, parent=None) + */ +# define PyGreenlet_New \ + (*(PyGreenlet * (*)(PyObject * run, PyGreenlet * parent)) \ + _PyGreenlet_API[PyGreenlet_New_NUM]) + +/* + * 
PyGreenlet_GetCurrent(void) + * + * greenlet.getcurrent() + */ +# define PyGreenlet_GetCurrent \ + (*(PyGreenlet * (*)(void)) _PyGreenlet_API[PyGreenlet_GetCurrent_NUM]) + +/* + * PyGreenlet_Throw( + * PyGreenlet *greenlet, + * PyObject *typ, + * PyObject *val, + * PyObject *tb) + * + * g.throw(...) + */ +# define PyGreenlet_Throw \ + (*(PyObject * (*)(PyGreenlet * self, \ + PyObject * typ, \ + PyObject * val, \ + PyObject * tb)) \ + _PyGreenlet_API[PyGreenlet_Throw_NUM]) + +/* + * PyGreenlet_Switch(PyGreenlet *greenlet, PyObject *args) + * + * g.switch(*args, **kwargs) + */ +# define PyGreenlet_Switch \ + (*(PyObject * \ + (*)(PyGreenlet * greenlet, PyObject * args, PyObject * kwargs)) \ + _PyGreenlet_API[PyGreenlet_Switch_NUM]) + +/* + * PyGreenlet_SetParent(PyObject *greenlet, PyObject *new_parent) + * + * g.parent = new_parent + */ +# define PyGreenlet_SetParent \ + (*(int (*)(PyGreenlet * greenlet, PyGreenlet * nparent)) \ + _PyGreenlet_API[PyGreenlet_SetParent_NUM]) + +/* + * PyGreenlet_GetParent(PyObject* greenlet) + * + * return greenlet.parent; + * + * This could return NULL even if there is no exception active. + * If it does not return NULL, you are responsible for decrementing the + * reference count. + */ +# define PyGreenlet_GetParent \ + (*(PyGreenlet* (*)(PyGreenlet*)) \ + _PyGreenlet_API[PyGreenlet_GET_PARENT_NUM]) + +/* + * deprecated, undocumented alias. 
+ */ +# define PyGreenlet_GET_PARENT PyGreenlet_GetParent + +# define PyGreenlet_MAIN \ + (*(int (*)(PyGreenlet*)) \ + _PyGreenlet_API[PyGreenlet_MAIN_NUM]) + +# define PyGreenlet_STARTED \ + (*(int (*)(PyGreenlet*)) \ + _PyGreenlet_API[PyGreenlet_STARTED_NUM]) + +# define PyGreenlet_ACTIVE \ + (*(int (*)(PyGreenlet*)) \ + _PyGreenlet_API[PyGreenlet_ACTIVE_NUM]) + + + + +/* Macro that imports greenlet and initializes C API */ +/* NOTE: This has actually moved to ``greenlet._greenlet._C_API``, but we + keep the older definition to be sure older code that might have a copy of + the header still works. */ +# define PyGreenlet_Import() \ + { \ + _PyGreenlet_API = (void**)PyCapsule_Import("greenlet._C_API", 0); \ + } + +#endif /* GREENLET_MODULE */ + +#ifdef __cplusplus +} +#endif +#endif /* !Py_GREENLETOBJECT_H */ diff --git a/.venv2/Scripts/Activate.ps1 b/.venv2/Scripts/Activate.ps1 new file mode 100644 index 0000000..b49d77b --- /dev/null +++ b/.venv2/Scripts/Activate.ps1 @@ -0,0 +1,247 @@ +<# +.Synopsis +Activate a Python virtual environment for the current PowerShell session. + +.Description +Pushes the python executable for a virtual environment to the front of the +$Env:PATH environment variable and sets the prompt to signify that you are +in a Python virtual environment. Makes use of the command line switches as +well as the `pyvenv.cfg` file values present in the virtual environment. + +.Parameter VenvDir +Path to the directory that contains the virtual environment to activate. The +default value for this is the parent of the directory that the Activate.ps1 +script is located within. + +.Parameter Prompt +The prompt prefix to display when this virtual environment is activated. By +default, this prompt is the name of the virtual environment folder (VenvDir) +surrounded by parentheses and followed by a single space (ie. '(.venv) '). + +.Example +Activate.ps1 +Activates the Python virtual environment that contains the Activate.ps1 script. 
+ +.Example +Activate.ps1 -Verbose +Activates the Python virtual environment that contains the Activate.ps1 script, +and shows extra information about the activation as it executes. + +.Example +Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv +Activates the Python virtual environment located in the specified location. + +.Example +Activate.ps1 -Prompt "MyPython" +Activates the Python virtual environment that contains the Activate.ps1 script, +and prefixes the current prompt with the specified string (surrounded in +parentheses) while the virtual environment is active. + +.Notes +On Windows, it may be required to enable this Activate.ps1 script by setting the +execution policy for the user. You can do this by issuing the following PowerShell +command: + +PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser + +For more information on Execution Policies: +https://go.microsoft.com/fwlink/?LinkID=135170 + +#> +Param( + [Parameter(Mandatory = $false)] + [String] + $VenvDir, + [Parameter(Mandatory = $false)] + [String] + $Prompt +) + +<# Function declarations --------------------------------------------------- #> + +<# +.Synopsis +Remove all shell session elements added by the Activate script, including the +addition of the virtual environment's Python executable from the beginning of +the PATH variable. + +.Parameter NonDestructive +If present, do not remove this function from the global namespace for the +session. 
+ +#> +function global:deactivate ([switch]$NonDestructive) { + # Revert to original values + + # The prior prompt: + if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) { + Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt + Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT + } + + # The prior PYTHONHOME: + if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) { + Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME + Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME + } + + # The prior PATH: + if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) { + Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH + Remove-Item -Path Env:_OLD_VIRTUAL_PATH + } + + # Just remove the VIRTUAL_ENV altogether: + if (Test-Path -Path Env:VIRTUAL_ENV) { + Remove-Item -Path env:VIRTUAL_ENV + } + + # Just remove VIRTUAL_ENV_PROMPT altogether. + if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) { + Remove-Item -Path env:VIRTUAL_ENV_PROMPT + } + + # Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether: + if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) { + Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force + } + + # Leave deactivate function in the global namespace if requested: + if (-not $NonDestructive) { + Remove-Item -Path function:deactivate + } +} + +<# +.Description +Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the +given folder, and returns them in a map. + +For each line in the pyvenv.cfg file, if that line can be parsed into exactly +two strings separated by `=` (with any amount of whitespace surrounding the =) +then it is considered a `key = value` line. The left hand string is the key, +the right hand is the value. + +If the value starts with a `'` or a `"` then the first and last character is +stripped from the value before being captured. + +.Parameter ConfigDir +Path to the directory that contains the `pyvenv.cfg` file. 
+#> +function Get-PyVenvConfig( + [String] + $ConfigDir +) { + Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg" + + # Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue). + $pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue + + # An empty map will be returned if no config file is found. + $pyvenvConfig = @{ } + + if ($pyvenvConfigPath) { + + Write-Verbose "File exists, parse `key = value` lines" + $pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath + + $pyvenvConfigContent | ForEach-Object { + $keyval = $PSItem -split "\s*=\s*", 2 + if ($keyval[0] -and $keyval[1]) { + $val = $keyval[1] + + # Remove extraneous quotations around a string value. + if ("'""".Contains($val.Substring(0, 1))) { + $val = $val.Substring(1, $val.Length - 2) + } + + $pyvenvConfig[$keyval[0]] = $val + Write-Verbose "Adding Key: '$($keyval[0])'='$val'" + } + } + } + return $pyvenvConfig +} + + +<# Begin Activate script --------------------------------------------------- #> + +# Determine the containing directory of this script +$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition +$VenvExecDir = Get-Item -Path $VenvExecPath + +Write-Verbose "Activation script is located in path: '$VenvExecPath'" +Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)" +Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)" + +# Set values required in priority: CmdLine, ConfigFile, Default +# First, get the location of the virtual environment, it might not be +# VenvExecDir if specified on the command line. +if ($VenvDir) { + Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values" +} +else { + Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir." 
+ $VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/") + Write-Verbose "VenvDir=$VenvDir" +} + +# Next, read the `pyvenv.cfg` file to determine any required value such +# as `prompt`. +$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir + +# Next, set the prompt from the command line, or the config file, or +# just use the name of the virtual environment folder. +if ($Prompt) { + Write-Verbose "Prompt specified as argument, using '$Prompt'" +} +else { + Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value" + if ($pyvenvCfg -and $pyvenvCfg['prompt']) { + Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'" + $Prompt = $pyvenvCfg['prompt']; + } + else { + Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)" + Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'" + $Prompt = Split-Path -Path $venvDir -Leaf + } +} + +Write-Verbose "Prompt = '$Prompt'" +Write-Verbose "VenvDir='$VenvDir'" + +# Deactivate any currently active virtual environment, but leave the +# deactivate function in place. +deactivate -nondestructive + +# Now set the environment variable VIRTUAL_ENV, used by many tools to determine +# that there is an activated venv. 
+$env:VIRTUAL_ENV = $VenvDir + +if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) { + + Write-Verbose "Setting prompt to '$Prompt'" + + # Set the prompt to include the env name + # Make sure _OLD_VIRTUAL_PROMPT is global + function global:_OLD_VIRTUAL_PROMPT { "" } + Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT + New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt + + function global:prompt { + Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) " + _OLD_VIRTUAL_PROMPT + } + $env:VIRTUAL_ENV_PROMPT = $Prompt +} + +# Clear PYTHONHOME +if (Test-Path -Path Env:PYTHONHOME) { + Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME + Remove-Item -Path Env:PYTHONHOME +} + +# Add the venv to the PATH +Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH +$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH" diff --git a/.venv2/Scripts/activate b/.venv2/Scripts/activate new file mode 100644 index 0000000..db11ccd --- /dev/null +++ b/.venv2/Scripts/activate @@ -0,0 +1,70 @@ +# This file must be used with "source bin/activate" *from bash* +# You cannot run it directly + +deactivate () { + # reset old environment variables + if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then + PATH="${_OLD_VIRTUAL_PATH:-}" + export PATH + unset _OLD_VIRTUAL_PATH + fi + if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then + PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}" + export PYTHONHOME + unset _OLD_VIRTUAL_PYTHONHOME + fi + + # Call hash to forget past commands. Without forgetting + # past commands the $PATH changes we made may not be respected + hash -r 2> /dev/null + + if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then + PS1="${_OLD_VIRTUAL_PS1:-}" + export PS1 + unset _OLD_VIRTUAL_PS1 + fi + + unset VIRTUAL_ENV + unset VIRTUAL_ENV_PROMPT + if [ ! "${1:-}" = "nondestructive" ] ; then + # Self destruct! 
+ unset -f deactivate + fi +} + +# unset irrelevant variables +deactivate nondestructive + +# on Windows, a path can contain colons and backslashes and has to be converted: +if [ "${OSTYPE:-}" = "cygwin" ] || [ "${OSTYPE:-}" = "msys" ] ; then + # transform D:\path\to\venv to /d/path/to/venv on MSYS + # and to /cygdrive/d/path/to/venv on Cygwin + export VIRTUAL_ENV=$(cygpath "C:\Users\Kevin\Documents\Work\SCP\AI\RAG\appl-kgraph\.venv2") +else + # use the path as-is + export VIRTUAL_ENV="C:\Users\Kevin\Documents\Work\SCP\AI\RAG\appl-kgraph\.venv2" +fi + +_OLD_VIRTUAL_PATH="$PATH" +PATH="$VIRTUAL_ENV/Scripts:$PATH" +export PATH + +# unset PYTHONHOME if set +# this will fail if PYTHONHOME is set to the empty string (which is bad anyway) +# could use `if (set -u; : $PYTHONHOME) ;` in bash +if [ -n "${PYTHONHOME:-}" ] ; then + _OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}" + unset PYTHONHOME +fi + +if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then + _OLD_VIRTUAL_PS1="${PS1:-}" + PS1="(.venv2) ${PS1:-}" + export PS1 + VIRTUAL_ENV_PROMPT="(.venv2) " + export VIRTUAL_ENV_PROMPT +fi + +# Call hash to forget past commands. Without forgetting +# past commands the $PATH changes we made may not be respected +hash -r 2> /dev/null diff --git a/.venv2/Scripts/activate.bat b/.venv2/Scripts/activate.bat new file mode 100644 index 0000000..9dd51fc --- /dev/null +++ b/.venv2/Scripts/activate.bat @@ -0,0 +1,34 @@ +@echo off + +rem This file is UTF-8 encoded, so we need to update the current code page while executing it +for /f "tokens=2 delims=:." 
%%a in ('"%SystemRoot%\System32\chcp.com"') do ( + set _OLD_CODEPAGE=%%a +) +if defined _OLD_CODEPAGE ( + "%SystemRoot%\System32\chcp.com" 65001 > nul +) + +set VIRTUAL_ENV=C:\Users\Kevin\Documents\Work\SCP\AI\RAG\appl-kgraph\.venv2 + +if not defined PROMPT set PROMPT=$P$G + +if defined _OLD_VIRTUAL_PROMPT set PROMPT=%_OLD_VIRTUAL_PROMPT% +if defined _OLD_VIRTUAL_PYTHONHOME set PYTHONHOME=%_OLD_VIRTUAL_PYTHONHOME% + +set _OLD_VIRTUAL_PROMPT=%PROMPT% +set PROMPT=(.venv2) %PROMPT% + +if defined PYTHONHOME set _OLD_VIRTUAL_PYTHONHOME=%PYTHONHOME% +set PYTHONHOME= + +if defined _OLD_VIRTUAL_PATH set PATH=%_OLD_VIRTUAL_PATH% +if not defined _OLD_VIRTUAL_PATH set _OLD_VIRTUAL_PATH=%PATH% + +set PATH=%VIRTUAL_ENV%\Scripts;%PATH% +set VIRTUAL_ENV_PROMPT=(.venv2) + +:END +if defined _OLD_CODEPAGE ( + "%SystemRoot%\System32\chcp.com" %_OLD_CODEPAGE% > nul + set _OLD_CODEPAGE= +) diff --git a/.venv2/Scripts/chroma.exe b/.venv2/Scripts/chroma.exe new file mode 100644 index 0000000..43ccf99 Binary files /dev/null and b/.venv2/Scripts/chroma.exe differ diff --git a/.venv2/Scripts/coloredlogs.exe b/.venv2/Scripts/coloredlogs.exe new file mode 100644 index 0000000..ef79943 Binary files /dev/null and b/.venv2/Scripts/coloredlogs.exe differ diff --git a/.venv2/Scripts/deactivate.bat b/.venv2/Scripts/deactivate.bat new file mode 100644 index 0000000..44dae49 --- /dev/null +++ b/.venv2/Scripts/deactivate.bat @@ -0,0 +1,22 @@ +@echo off + +if defined _OLD_VIRTUAL_PROMPT ( + set "PROMPT=%_OLD_VIRTUAL_PROMPT%" +) +set _OLD_VIRTUAL_PROMPT= + +if defined _OLD_VIRTUAL_PYTHONHOME ( + set "PYTHONHOME=%_OLD_VIRTUAL_PYTHONHOME%" + set _OLD_VIRTUAL_PYTHONHOME= +) + +if defined _OLD_VIRTUAL_PATH ( + set "PATH=%_OLD_VIRTUAL_PATH%" +) + +set _OLD_VIRTUAL_PATH= + +set VIRTUAL_ENV= +set VIRTUAL_ENV_PROMPT= + +:END diff --git a/.venv2/Scripts/distro.exe b/.venv2/Scripts/distro.exe new file mode 100644 index 0000000..00cde8d Binary files /dev/null and b/.venv2/Scripts/distro.exe differ diff --git 
a/.venv2/Scripts/docx2pdf.exe b/.venv2/Scripts/docx2pdf.exe new file mode 100644 index 0000000..ffa497a Binary files /dev/null and b/.venv2/Scripts/docx2pdf.exe differ diff --git a/.venv2/Scripts/dotenv.exe b/.venv2/Scripts/dotenv.exe new file mode 100644 index 0000000..fbd2c43 Binary files /dev/null and b/.venv2/Scripts/dotenv.exe differ diff --git a/.venv2/Scripts/f2py.exe b/.venv2/Scripts/f2py.exe new file mode 100644 index 0000000..0003352 Binary files /dev/null and b/.venv2/Scripts/f2py.exe differ diff --git a/.venv2/Scripts/fastapi.exe b/.venv2/Scripts/fastapi.exe new file mode 100644 index 0000000..a295d41 Binary files /dev/null and b/.venv2/Scripts/fastapi.exe differ diff --git a/.venv2/Scripts/fonttools.exe b/.venv2/Scripts/fonttools.exe new file mode 100644 index 0000000..6ed0f78 Binary files /dev/null and b/.venv2/Scripts/fonttools.exe differ diff --git a/.venv2/Scripts/gradio.exe b/.venv2/Scripts/gradio.exe new file mode 100644 index 0000000..c81eef3 Binary files /dev/null and b/.venv2/Scripts/gradio.exe differ diff --git a/.venv2/Scripts/hf.exe b/.venv2/Scripts/hf.exe new file mode 100644 index 0000000..6a410a8 Binary files /dev/null and b/.venv2/Scripts/hf.exe differ diff --git a/.venv2/Scripts/httpx.exe b/.venv2/Scripts/httpx.exe new file mode 100644 index 0000000..3556585 Binary files /dev/null and b/.venv2/Scripts/httpx.exe differ diff --git a/.venv2/Scripts/huggingface-cli.exe b/.venv2/Scripts/huggingface-cli.exe new file mode 100644 index 0000000..03f13c0 Binary files /dev/null and b/.venv2/Scripts/huggingface-cli.exe differ diff --git a/.venv2/Scripts/humanfriendly.exe b/.venv2/Scripts/humanfriendly.exe new file mode 100644 index 0000000..746633e Binary files /dev/null and b/.venv2/Scripts/humanfriendly.exe differ diff --git a/.venv2/Scripts/ipython.exe b/.venv2/Scripts/ipython.exe new file mode 100644 index 0000000..c204ca2 Binary files /dev/null and b/.venv2/Scripts/ipython.exe differ diff --git a/.venv2/Scripts/ipython3.exe 
b/.venv2/Scripts/ipython3.exe new file mode 100644 index 0000000..c204ca2 Binary files /dev/null and b/.venv2/Scripts/ipython3.exe differ diff --git a/.venv2/Scripts/isympy.exe b/.venv2/Scripts/isympy.exe new file mode 100644 index 0000000..ca9065e Binary files /dev/null and b/.venv2/Scripts/isympy.exe differ diff --git a/.venv2/Scripts/jsondiff b/.venv2/Scripts/jsondiff new file mode 100644 index 0000000..cb80f7f --- /dev/null +++ b/.venv2/Scripts/jsondiff @@ -0,0 +1,41 @@ +#!C:\Users\Kevin\Documents\Work\SCP\AI\RAG\appl-kgraph\.venv2\Scripts\python.exe +# -*- coding: utf-8 -*- + +from __future__ import print_function + +import sys +import json +import jsonpatch +import argparse + + +parser = argparse.ArgumentParser(description='Diff two JSON files') +parser.add_argument('FILE1', type=argparse.FileType('r')) +parser.add_argument('FILE2', type=argparse.FileType('r')) +parser.add_argument('--indent', type=int, default=None, + help='Indent output by n spaces') +parser.add_argument('-u', '--preserve-unicode', action='store_true', + help='Output Unicode character as-is without using Code Point') +parser.add_argument('-v', '--version', action='version', + version='%(prog)s ' + jsonpatch.__version__) + + +def main(): + try: + diff_files() + except KeyboardInterrupt: + sys.exit(1) + + +def diff_files(): + """ Diffs two JSON files and prints a patch """ + args = parser.parse_args() + doc1 = json.load(args.FILE1) + doc2 = json.load(args.FILE2) + patch = jsonpatch.make_patch(doc1, doc2) + if patch.patch: + print(json.dumps(patch.patch, indent=args.indent, ensure_ascii=not(args.preserve_unicode))) + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/.venv2/Scripts/jsonpatch b/.venv2/Scripts/jsonpatch new file mode 100644 index 0000000..e528717 --- /dev/null +++ b/.venv2/Scripts/jsonpatch @@ -0,0 +1,107 @@ +#!C:\Users\Kevin\Documents\Work\SCP\AI\RAG\appl-kgraph\.venv2\Scripts\python.exe +# -*- coding: utf-8 -*- + +import sys +import os.path +import json +import 
jsonpatch +import tempfile +import argparse + + +parser = argparse.ArgumentParser( + description='Apply a JSON patch on a JSON file') +parser.add_argument('ORIGINAL', type=argparse.FileType('r'), + help='Original file') +parser.add_argument('PATCH', type=argparse.FileType('r'), + nargs='?', default=sys.stdin, + help='Patch file (read from stdin if omitted)') +parser.add_argument('--indent', type=int, default=None, + help='Indent output by n spaces') +parser.add_argument('-b', '--backup', action='store_true', + help='Back up ORIGINAL if modifying in-place') +parser.add_argument('-i', '--in-place', action='store_true', + help='Modify ORIGINAL in-place instead of to stdout') +parser.add_argument('-v', '--version', action='version', + version='%(prog)s ' + jsonpatch.__version__) +parser.add_argument('-u', '--preserve-unicode', action='store_true', + help='Output Unicode character as-is without using Code Point') + +def main(): + try: + patch_files() + except KeyboardInterrupt: + sys.exit(1) + + +def patch_files(): + """ Diffs two JSON files and prints a patch """ + args = parser.parse_args() + doc = json.load(args.ORIGINAL) + patch = json.load(args.PATCH) + result = jsonpatch.apply_patch(doc, patch) + + if args.in_place: + dirname = os.path.abspath(os.path.dirname(args.ORIGINAL.name)) + + try: + # Attempt to replace the file atomically. We do this by + # creating a temporary file in the same directory as the + # original file so we can atomically move the new file over + # the original later. (This is done in the same directory + # because atomic renames do not work across mount points.) + + fd, pathname = tempfile.mkstemp(dir=dirname) + fp = os.fdopen(fd, 'w') + atomic = True + + except OSError: + # We failed to create the temporary file for an atomic + # replace, so fall back to non-atomic mode by backing up + # the original (if desired) and writing a new file. 
+ + if args.backup: + os.rename(args.ORIGINAL.name, args.ORIGINAL.name + '.orig') + fp = open(args.ORIGINAL.name, 'w') + atomic = False + + else: + # Since we're not replacing the original file in-place, write + # the modified JSON to stdout instead. + + fp = sys.stdout + + # By this point we have some sort of file object we can write the + # modified JSON to. + + json.dump(result, fp, indent=args.indent, ensure_ascii=not(args.preserve_unicode)) + fp.write('\n') + + if args.in_place: + # Close the new file. If we aren't replacing atomically, this + # is our last step, since everything else is already in place. + + fp.close() + + if atomic: + try: + # Complete the atomic replace by linking the original + # to a backup (if desired), fixing up the permissions + # on the temporary file, and moving it into place. + + if args.backup: + os.link(args.ORIGINAL.name, args.ORIGINAL.name + '.orig') + os.chmod(pathname, os.stat(args.ORIGINAL.name).st_mode) + os.rename(pathname, args.ORIGINAL.name) + + except OSError: + # In the event we could not actually do the atomic + # replace, unlink the original to move it out of the + # way and finally move the temporary file into place. 
+ + os.unlink(args.ORIGINAL.name) + os.rename(pathname, args.ORIGINAL.name) + + +if __name__ == "__main__": + main() diff --git a/.venv2/Scripts/jsonpointer b/.venv2/Scripts/jsonpointer new file mode 100644 index 0000000..69feca9 --- /dev/null +++ b/.venv2/Scripts/jsonpointer @@ -0,0 +1,67 @@ +#!C:\Users\Kevin\Documents\Work\SCP\AI\RAG\appl-kgraph\.venv2\Scripts\python.exe +# -*- coding: utf-8 -*- + + +import argparse +import json +import sys + +import jsonpointer + +parser = argparse.ArgumentParser( + description='Resolve a JSON pointer on JSON files') + +# Accept pointer as argument or as file +ptr_group = parser.add_mutually_exclusive_group(required=True) + +ptr_group.add_argument('-f', '--pointer-file', type=argparse.FileType('r'), + nargs='?', + help='File containing a JSON pointer expression') + +ptr_group.add_argument('POINTER', type=str, nargs='?', + help='A JSON pointer expression') + +parser.add_argument('FILE', type=argparse.FileType('r'), nargs='+', + help='Files for which the pointer should be resolved') +parser.add_argument('--indent', type=int, default=None, + help='Indent output by n spaces') +parser.add_argument('-v', '--version', action='version', + version='%(prog)s ' + jsonpointer.__version__) + + +def main(): + try: + resolve_files() + except KeyboardInterrupt: + sys.exit(1) + + +def parse_pointer(args): + if args.POINTER: + ptr = args.POINTER + elif args.pointer_file: + ptr = args.pointer_file.read().strip() + else: + parser.print_usage() + sys.exit(1) + + return ptr + + +def resolve_files(): + """ Resolve a JSON pointer on JSON files """ + args = parser.parse_args() + + ptr = parse_pointer(args) + + for f in args.FILE: + doc = json.load(f) + try: + result = jsonpointer.resolve_pointer(doc, ptr) + print(json.dumps(result, indent=args.indent)) + except jsonpointer.JsonPointerException as e: + print('Could not resolve pointer: %s' % str(e), file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/.venv2/Scripts/jsonschema.exe 
b/.venv2/Scripts/jsonschema.exe new file mode 100644 index 0000000..f3c8336 Binary files /dev/null and b/.venv2/Scripts/jsonschema.exe differ diff --git a/.venv2/Scripts/markdown-it.exe b/.venv2/Scripts/markdown-it.exe new file mode 100644 index 0000000..e3f6db7 Binary files /dev/null and b/.venv2/Scripts/markdown-it.exe differ diff --git a/.venv2/Scripts/normalizer.exe b/.venv2/Scripts/normalizer.exe new file mode 100644 index 0000000..64d6ed3 Binary files /dev/null and b/.venv2/Scripts/normalizer.exe differ diff --git a/.venv2/Scripts/numpy-config.exe b/.venv2/Scripts/numpy-config.exe new file mode 100644 index 0000000..1201da7 Binary files /dev/null and b/.venv2/Scripts/numpy-config.exe differ diff --git a/.venv2/Scripts/onnxruntime_test.exe b/.venv2/Scripts/onnxruntime_test.exe new file mode 100644 index 0000000..10f5d34 Binary files /dev/null and b/.venv2/Scripts/onnxruntime_test.exe differ diff --git a/.venv2/Scripts/openai.exe b/.venv2/Scripts/openai.exe new file mode 100644 index 0000000..12e5395 Binary files /dev/null and b/.venv2/Scripts/openai.exe differ diff --git a/.venv2/Scripts/pip.exe b/.venv2/Scripts/pip.exe new file mode 100644 index 0000000..550cac4 Binary files /dev/null and b/.venv2/Scripts/pip.exe differ diff --git a/.venv2/Scripts/pip3.12.exe b/.venv2/Scripts/pip3.12.exe new file mode 100644 index 0000000..550cac4 Binary files /dev/null and b/.venv2/Scripts/pip3.12.exe differ diff --git a/.venv2/Scripts/pip3.exe b/.venv2/Scripts/pip3.exe new file mode 100644 index 0000000..550cac4 Binary files /dev/null and b/.venv2/Scripts/pip3.exe differ diff --git a/.venv2/Scripts/pybase64.exe b/.venv2/Scripts/pybase64.exe new file mode 100644 index 0000000..c593d9d Binary files /dev/null and b/.venv2/Scripts/pybase64.exe differ diff --git a/.venv2/Scripts/pyftmerge.exe b/.venv2/Scripts/pyftmerge.exe new file mode 100644 index 0000000..b733be4 Binary files /dev/null and b/.venv2/Scripts/pyftmerge.exe differ diff --git a/.venv2/Scripts/pyftsubset.exe 
b/.venv2/Scripts/pyftsubset.exe new file mode 100644 index 0000000..1218634 Binary files /dev/null and b/.venv2/Scripts/pyftsubset.exe differ diff --git a/.venv2/Scripts/pygmentize.exe b/.venv2/Scripts/pygmentize.exe new file mode 100644 index 0000000..24010ce Binary files /dev/null and b/.venv2/Scripts/pygmentize.exe differ diff --git a/.venv2/Scripts/pymupdf.exe b/.venv2/Scripts/pymupdf.exe new file mode 100644 index 0000000..66eaa4b Binary files /dev/null and b/.venv2/Scripts/pymupdf.exe differ diff --git a/.venv2/Scripts/pyproject-build.exe b/.venv2/Scripts/pyproject-build.exe new file mode 100644 index 0000000..f6cdc9e Binary files /dev/null and b/.venv2/Scripts/pyproject-build.exe differ diff --git a/.venv2/Scripts/pyrsa-decrypt.exe b/.venv2/Scripts/pyrsa-decrypt.exe new file mode 100644 index 0000000..fb14df9 Binary files /dev/null and b/.venv2/Scripts/pyrsa-decrypt.exe differ diff --git a/.venv2/Scripts/pyrsa-encrypt.exe b/.venv2/Scripts/pyrsa-encrypt.exe new file mode 100644 index 0000000..8575f13 Binary files /dev/null and b/.venv2/Scripts/pyrsa-encrypt.exe differ diff --git a/.venv2/Scripts/pyrsa-keygen.exe b/.venv2/Scripts/pyrsa-keygen.exe new file mode 100644 index 0000000..b79ffc5 Binary files /dev/null and b/.venv2/Scripts/pyrsa-keygen.exe differ diff --git a/.venv2/Scripts/pyrsa-priv2pub.exe b/.venv2/Scripts/pyrsa-priv2pub.exe new file mode 100644 index 0000000..d246a18 Binary files /dev/null and b/.venv2/Scripts/pyrsa-priv2pub.exe differ diff --git a/.venv2/Scripts/pyrsa-sign.exe b/.venv2/Scripts/pyrsa-sign.exe new file mode 100644 index 0000000..c266f67 Binary files /dev/null and b/.venv2/Scripts/pyrsa-sign.exe differ diff --git a/.venv2/Scripts/pyrsa-verify.exe b/.venv2/Scripts/pyrsa-verify.exe new file mode 100644 index 0000000..bd87775 Binary files /dev/null and b/.venv2/Scripts/pyrsa-verify.exe differ diff --git a/.venv2/Scripts/python.exe b/.venv2/Scripts/python.exe new file mode 100644 index 0000000..1942b9a Binary files /dev/null and 
# b/.venv2/Scripts/python.exe differ
# diff --git a/.venv2/Scripts/pythonw.exe b/.venv2/Scripts/pythonw.exe new file mode 100644 index 0000000..e770f9d
# Binary files /dev/null and b/.venv2/Scripts/pythonw.exe differ
# diff --git a/.venv2/Scripts/pywin32_postinstall.exe b/.venv2/Scripts/pywin32_postinstall.exe new file mode 100644 index 0000000..5a34b0f
# Binary files /dev/null and b/.venv2/Scripts/pywin32_postinstall.exe differ
# diff --git a/.venv2/Scripts/pywin32_postinstall.py b/.venv2/Scripts/pywin32_postinstall.py new file mode 100644 index 0000000..7b1a1fd
# --- /dev/null
# +++ b/.venv2/Scripts/pywin32_postinstall.py
# @@ -0,0 +1,733 @@
# postinstall script for pywin32
#
# copies pywintypesXX.dll and pythoncomXX.dll into the system directory,
# and creates a pth file
import argparse
import glob
import os
import shutil
import sys
import sysconfig
import tempfile
import winreg

# Deliberately kept open for the life of the process: every Tee instance
# mirrors its output into this log file.
tee_f = open(
    os.path.join(
        tempfile.gettempdir(),  # Send output somewhere so it can be found if necessary...
        "pywin32_postinstall.log",
    ),
    "w",
)


class Tee:
    """File-like wrapper that writes to a stream and to the ``tee_f`` log."""

    def __init__(self, file):
        # ``file`` may be None; in that case only the log file is written.
        self.f = file

    def write(self, what):
        """Write *what* to the wrapped stream (best effort) and to the log."""
        if self.f is not None:
            try:
                self.f.write(what.replace("\n", "\r\n"))
            except OSError:
                # The wrapped stream is optional; the log below still gets it.
                pass
        tee_f.write(what)

    def flush(self):
        """Flush the wrapped stream (best effort) and the log."""
        if self.f is not None:
            try:
                self.f.flush()
            except OSError:
                pass
        tee_f.flush()


sys.stderr = Tee(sys.stderr)
sys.stdout = Tee(sys.stdout)

com_modules = [
    # module_name, class_names
    ("win32com.servers.interp", "Interpreter"),
    ("win32com.servers.dictionary", "DictionaryPolicy"),
    ("win32com.axscript.client.pyscript", "PyScript"),
]

# Is this a 'silent' install - ie, avoid all dialogs.
# Different than 'verbose'
silent = 0

# Verbosity of output messages.
verbose = 1

root_key_name = "Software\\Python\\PythonCore\\" + sys.winver


def get_root_hkey():
    """Return HKEY_LOCAL_MACHINE if we may create sub-keys there, else HKEY_CURRENT_USER."""
    try:
        # The key is opened only as a permission probe, so close it at once.
        with winreg.OpenKey(
            winreg.HKEY_LOCAL_MACHINE, root_key_name, 0, winreg.KEY_CREATE_SUB_KEY
        ):
            pass
        return winreg.HKEY_LOCAL_MACHINE
    except OSError:
        # Either not exist, or no permissions to create subkey means
        # must be HKCU
        return winreg.HKEY_CURRENT_USER


# Create a function with the same signature as create_shortcut
# previously provided by bdist_wininst
def create_shortcut(
    path, description, filename, arguments="", workdir="", iconpath="", iconindex=0
):
    """Create a shell link at *filename* pointing at *path*.

    *arguments*, *workdir* and the icon location are only set on the link
    when they are given.
    """
    import pythoncom
    from win32com.shell import shell

    ilink = pythoncom.CoCreateInstance(
        shell.CLSID_ShellLink,
        None,
        pythoncom.CLSCTX_INPROC_SERVER,
        shell.IID_IShellLink,
    )
    ilink.SetPath(path)
    ilink.SetDescription(description)
    if arguments:
        ilink.SetArguments(arguments)
    if workdir:
        ilink.SetWorkingDirectory(workdir)
    if iconpath or iconindex:
        ilink.SetIconLocation(iconpath, iconindex)
    # now save it.
    ipf = ilink.QueryInterface(pythoncom.IID_IPersistFile)
    ipf.Save(filename, 0)


# Support the same list of "path names" as bdist_wininst used to
def get_special_folder_path(path_name):
    """Return the folder path for the CSIDL name *path_name*.

    Raises ValueError when *path_name* is not one of the supported names.
    """
    from win32com.shell import shell, shellcon

    for maybe in """
        CSIDL_COMMON_STARTMENU CSIDL_STARTMENU CSIDL_COMMON_APPDATA
        CSIDL_LOCAL_APPDATA CSIDL_APPDATA CSIDL_COMMON_DESKTOPDIRECTORY
        CSIDL_DESKTOPDIRECTORY CSIDL_COMMON_STARTUP CSIDL_STARTUP
        CSIDL_COMMON_PROGRAMS CSIDL_PROGRAMS CSIDL_PROGRAM_FILES_COMMON
        CSIDL_PROGRAM_FILES CSIDL_FONTS""".split():
        if maybe == path_name:
            csidl = getattr(shellcon, maybe)
            return shell.SHGetSpecialFolderPath(0, csidl, False)
    raise ValueError(f"{path_name} is an unknown path ID")


def CopyTo(desc, src, dest):
    """Copy *src* to *dest*, offering Abort/Retry/Ignore on failure.

    "Access denied" errors, and any error while the module-level ``silent``
    flag is set, are re-raised without showing the dialog.
    """
    import win32api
    import win32con

    while True:
        try:
            win32api.CopyFile(src, dest, 0)
            return
        except win32api.error as details:
            if details.winerror == 5:  # access denied - user not admin.
                raise
            if silent:
                # Running silent mode - just re-raise the error.
                raise
            full_desc = (
                f"Error {desc}\n\n"
                "If you have any Python applications running, "
                f"please close them now\nand select 'Retry'\n\n{details.strerror}"
            )
            rc = win32api.MessageBox(
                0, full_desc, "Installation Error", win32con.MB_ABORTRETRYIGNORE
            )
            if rc == win32con.IDABORT:
                raise
            elif rc == win32con.IDIGNORE:
                return
            # else retry - around we go again.


# We need to import win32api to determine the Windows system directory,
# so we can copy our system files there - but importing win32api will
# load the pywintypes.dll already in the system directory preventing us
# from updating them!
# So, we pull the same trick pywintypes.py does, but it loads from
# our pywintypes_system32 directory.
def LoadSystemModule(lib_dir, modname):
    """Load *modname* from the DLL under ``<lib_dir>/pywin32_system32``."""
    # See if this is a debug build.
    import importlib.machinery
    import importlib.util

    suffix = "_d" if "_d.pyd" in importlib.machinery.EXTENSION_SUFFIXES else ""
    filename = "%s%d%d%s.dll" % (
        modname,
        sys.version_info.major,
        sys.version_info.minor,
        suffix,
    )
    filename = os.path.join(lib_dir, "pywin32_system32", filename)
    loader = importlib.machinery.ExtensionFileLoader(modname, filename)
    spec = importlib.machinery.ModuleSpec(name=modname, loader=loader, origin=filename)
    mod = importlib.util.module_from_spec(spec)
    loader.exec_module(mod)


def SetPyKeyVal(key_name, value_name, value):
    """Set a REG_SZ *value* named *value_name* under ``root_key_name\\key_name``."""
    root_hkey = get_root_hkey()
    root_key = winreg.OpenKey(root_hkey, root_key_name)
    try:
        my_key = winreg.CreateKey(root_key, key_name)
        try:
            winreg.SetValueEx(my_key, value_name, 0, winreg.REG_SZ, value)
            if verbose:
                print(f"-> {root_key_name}\\{key_name}[{value_name}]={value!r}")
        finally:
            my_key.Close()
    finally:
        root_key.Close()


def UnsetPyKeyVal(key_name, value_name, delete_key=False):
    """Delete *value_name* (and optionally the key) under ``root_key_name\\key_name``.

    A "file not found" error (winerror 2) is ignored; other OSErrors propagate.
    """
    root_hkey = get_root_hkey()
    root_key = winreg.OpenKey(root_hkey, root_key_name)
    try:
        my_key = winreg.OpenKey(root_key, key_name, 0, winreg.KEY_SET_VALUE)
        try:
            winreg.DeleteValue(my_key, value_name)
            if verbose:
                print(f"-> DELETE {root_key_name}\\{key_name}[{value_name}]")
        finally:
            my_key.Close()
        if delete_key:
            winreg.DeleteKey(root_key, key_name)
            if verbose:
                print(f"-> DELETE {root_key_name}\\{key_name}")
    except OSError as why:
        winerror = getattr(why, "winerror", why.errno)
        if winerror != 2:  # file not found
            raise
    finally:
        root_key.Close()


def RegisterCOMObjects(register=True):
    """Register (or unregister) every class listed in ``com_modules``."""
    import win32com.server.register

    if register:
        func = win32com.server.register.RegisterClasses
    else:
        func = win32com.server.register.UnregisterClasses
    flags = {}
    if not verbose:
        flags["quiet"] = 1
    for module, klass_name in com_modules:
        __import__(module)
        mod = sys.modules[module]
        flags["finalize_register"] = getattr(mod, "DllRegisterServer", None)
        flags["finalize_unregister"] = getattr(mod, "DllUnregisterServer", None)
        klass = getattr(mod, klass_name)
        func(klass, **flags)


def RegisterHelpFile(register=True, lib_dir=None):
    """Register or unregister ``PyWin32.chm``.

    Returns the path of the .chm file when it was registered, else None.
    """
    if lib_dir is None:
        lib_dir = sysconfig.get_paths()["platlib"]
    if register:
        # Register the .chm help file.
        chm_file = os.path.join(lib_dir, "PyWin32.chm")
        if os.path.isfile(chm_file):
            # This isn't recursive, so if 'Help' doesn't exist, we croak
            SetPyKeyVal("Help", None, None)
            SetPyKeyVal("Help\\Pythonwin Reference", None, chm_file)
            return chm_file
        else:
            print("NOTE: PyWin32.chm can not be located, so has not been registered")
    else:
        UnsetPyKeyVal("Help\\Pythonwin Reference", None, delete_key=True)
    return None


def RegisterPythonwin(register=True, lib_dir=None):
    """Add (or remove) Pythonwin to context menu for python scripts.
    ??? Should probably also add Edit command for pys files also.
    Also need to remove these keys on uninstall, but there's no function
    to add registry entries to uninstall log ???
    """
    if lib_dir is None:
        lib_dir = sysconfig.get_paths()["platlib"]
    classes_root = get_root_hkey()
    ## Installer executable doesn't seem to pass anything to postinstall script indicating if it's a debug build
    pythonwin_exe = os.path.join(lib_dir, "Pythonwin", "Pythonwin.exe")
    pythonwin_edit_command = pythonwin_exe + ' -edit "%1"'

    keys_vals = [
        (
            "Software\\Microsoft\\Windows\\CurrentVersion\\App Paths\\Pythonwin.exe",
            "",
            pythonwin_exe,
        ),
        (
            "Software\\Classes\\Python.File\\shell\\Edit with Pythonwin",
            "command",
            pythonwin_edit_command,
        ),
        (
            "Software\\Classes\\Python.NoConFile\\shell\\Edit with Pythonwin",
            "command",
            pythonwin_edit_command,
        ),
    ]

    try:
        if register:
            for key, sub_key, val in keys_vals:
                ## Since winreg only uses the character Api functions, this can fail if Python
                ## is installed to a path containing non-ascii characters
                # Context managers close every handle, including the parent
                # key when a sub-key is created, even if SetValueEx fails.
                with winreg.CreateKey(classes_root, key) as hkey:
                    if sub_key:
                        with winreg.CreateKey(hkey, sub_key) as sub_hkey:
                            winreg.SetValueEx(sub_hkey, None, 0, winreg.REG_SZ, val)
                    else:
                        winreg.SetValueEx(hkey, None, 0, winreg.REG_SZ, val)
        else:
            for key, sub_key, val in keys_vals:
                try:
                    if sub_key:
                        with winreg.OpenKey(classes_root, key) as hkey:
                            winreg.DeleteKey(hkey, sub_key)
                    winreg.DeleteKey(classes_root, key)
                except OSError as why:
                    winerror = getattr(why, "winerror", why.errno)
                    if winerror != 2:  # file not found
                        raise
    finally:
        # tell windows about the change
        from win32com.shell import shell, shellcon

        shell.SHChangeNotify(
            shellcon.SHCNE_ASSOCCHANGED, shellcon.SHCNF_IDLIST, None, None
        )


def get_shortcuts_folder():
    """Return the Start Menu folder in which our shortcuts are placed."""
    if get_root_hkey() == winreg.HKEY_LOCAL_MACHINE:
        try:
            fldr = get_special_folder_path("CSIDL_COMMON_PROGRAMS")
        except OSError:
            # No CSIDL_COMMON_PROGRAMS on this platform
            fldr = get_special_folder_path("CSIDL_PROGRAMS")
    else:
        # non-admin install - always goes in this user's start menu.
        fldr = get_special_folder_path("CSIDL_PROGRAMS")

    try:
        install_group = winreg.QueryValue(
            get_root_hkey(), root_key_name + "\\InstallPath\\InstallGroup"
        )
    except OSError:
        install_group = "Python %d.%d" % (
            sys.version_info.major,
            sys.version_info.minor,
        )
    return os.path.join(fldr, install_group)


# Get the system directory, which may be the Wow64 directory if we are a 32bit
# python on a 64bit OS.
def get_system_dir():
    """Return the Windows system directory files should be copied to."""
    import win32api  # we assume this exists.

    try:
        import pythoncom
        import win32process
        from win32com.shell import shell, shellcon

        try:
            if win32process.IsWow64Process():
                return shell.SHGetSpecialFolderPath(0, shellcon.CSIDL_SYSTEMX86)
            return shell.SHGetSpecialFolderPath(0, shellcon.CSIDL_SYSTEM)
        except (pythoncom.com_error, win32process.error):
            return win32api.GetSystemDirectory()
    except ImportError:
        return win32api.GetSystemDirectory()


def fixup_dbi():
    """Move a stale ``dbi.pyd``/``dbi_d.pyd`` out of the way of ``dbi.py``."""
    # We used to have a dbi.pyd with our .pyd files, but now have a .py file.
    # If the user didn't uninstall, they will find the .pyd which will cause
    # problems - so handle that.
    import win32api
    import win32con

    pyd_name = os.path.join(os.path.dirname(win32api.__file__), "dbi.pyd")
    pyd_d_name = os.path.join(os.path.dirname(win32api.__file__), "dbi_d.pyd")
    py_name = os.path.join(os.path.dirname(win32con.__file__), "dbi.py")
    for this_pyd in (pyd_name, pyd_d_name):
        this_dest = this_pyd + ".old"
        if os.path.isfile(this_pyd) and os.path.isfile(py_name):
            try:
                if os.path.isfile(this_dest):
                    print(
                        f"Old dbi '{this_dest}' already exists - deleting '{this_pyd}'"
                    )
                    os.remove(this_pyd)
                else:
                    os.rename(this_pyd, this_dest)
                    print(f"renamed '{this_pyd}'->'{this_pyd}.old'")
            except OSError as exc:
                print(f"FAILED to rename '{this_pyd}': {exc}")


def install(lib_dir):
    """Copy the system DLLs and register COM objects, help file and shortcuts.

    Raises RuntimeError when there are no system files to copy or when they
    cannot be installed for lack of permissions. Failures of the later
    registration steps are printed rather than raised.
    """
    import traceback

    # The .pth file is now installed as a regular file.
    # Create the .pth file in the site-packages dir, and use only relative paths
    # We used to write a .pth directly to sys.prefix - clobber it.
    if os.path.isfile(os.path.join(sys.prefix, "pywin32.pth")):
        os.unlink(os.path.join(sys.prefix, "pywin32.pth"))
    # The .pth may be new and therefore not loaded in this session.
    # Setup the paths just in case.
    for name in "win32 win32\\lib Pythonwin".split():
        sys.path.append(os.path.join(lib_dir, name))
    # It is possible people with old versions installed with still have
    # pywintypes and pythoncom registered.  We no longer need this, and stale
    # entries hurt us.
    for name in "pythoncom pywintypes".split():
        keyname = "Software\\Python\\PythonCore\\" + sys.winver + "\\Modules\\" + name
        for root in winreg.HKEY_LOCAL_MACHINE, winreg.HKEY_CURRENT_USER:
            try:
                winreg.DeleteKey(root, keyname + "\\Debug")
            except OSError:
                pass
            try:
                winreg.DeleteKey(root, keyname)
            except OSError:
                pass
    LoadSystemModule(lib_dir, "pywintypes")
    LoadSystemModule(lib_dir, "pythoncom")
    import win32api

    # and now we can get the system directory:
    files = glob.glob(os.path.join(lib_dir, "pywin32_system32\\*.*"))
    if not files:
        raise RuntimeError("No system files to copy!!")
    # Try the system32 directory first - if that fails due to "access denied",
    # it implies a non-admin user, and we use sys.prefix
    for dest_dir in [get_system_dir(), sys.prefix]:
        # and copy some files over there
        worked = 0
        try:
            for fname in files:
                base = os.path.basename(fname)
                dst = os.path.join(dest_dir, base)
                CopyTo(f"installing {base}", fname, dst)
                if verbose:
                    print(f"Copied {base} to {dst}")
                worked = 1
                # Nuke any other versions that may exist - having
                # duplicates causes major headaches.
                bad_dest_dirs = [
                    os.path.join(sys.prefix, "Library\\bin"),
                    os.path.join(sys.prefix, "Lib\\site-packages\\win32"),
                ]
                if dest_dir != sys.prefix:
                    bad_dest_dirs.append(sys.prefix)
                for bad_dest_dir in bad_dest_dirs:
                    bad_fname = os.path.join(bad_dest_dir, base)
                    if os.path.exists(bad_fname):
                        # let exceptions go here - delete must succeed
                        os.unlink(bad_fname)
            if worked:
                break
        except win32api.error as details:
            if details.winerror == 5:
                # access denied - user not admin - try sys.prefix dir,
                # but first check that a version doesn't already exist
                # in that place - otherwise that one will still get used!
                if os.path.exists(dst):
                    msg = (
                        "The file '%s' exists, but can not be replaced "
                        "due to insufficient permissions.  You must "
                        "reinstall this software as an Administrator" % dst
                    )
                    print(msg)
                    raise RuntimeError(msg)
                continue
            raise
    else:
        raise RuntimeError(
            "You don't have enough permissions to install the system files"
        )

    # Register our demo COM objects.
    try:
        try:
            RegisterCOMObjects()
        except win32api.error as details:
            if details.winerror != 5:  # ERROR_ACCESS_DENIED
                raise
            print("You do not have the permissions to install COM objects.")
            print("The sample COM objects were not registered.")
    except Exception:
        print("FAILED to register the Python COM objects")
        traceback.print_exc()

    # There may be no main Python key in HKCU if, eg, an admin installed
    # python itself.
    # Only the key's existence matters, so release the handle immediately.
    winreg.CreateKey(get_root_hkey(), root_key_name).Close()

    chm_file = None
    try:
        chm_file = RegisterHelpFile(True, lib_dir)
    except Exception:
        print("Failed to register help file")
        traceback.print_exc()
    else:
        if verbose:
            print("Registered help file")

    # misc other fixups.
    fixup_dbi()

    # Register Pythonwin in context menu
    try:
        RegisterPythonwin(True, lib_dir)
    except Exception:
        print("Failed to register pythonwin as editor")
        traceback.print_exc()
    else:
        if verbose:
            print("Pythonwin has been registered in context menu")

    # Create the win32com\gen_py directory.
    make_dir = os.path.join(lib_dir, "win32com", "gen_py")
    if not os.path.isdir(make_dir):
        if verbose:
            print(f"Creating directory {make_dir}")
        os.mkdir(make_dir)

    try:
        # create shortcuts
        # CSIDL_COMMON_PROGRAMS only available works on NT/2000/XP, and
        # will fail there if the user has no admin rights.
        fldr = get_shortcuts_folder()
        # If the group doesn't exist, then we don't make shortcuts - its
        # possible that this isn't a "normal" install.
        if os.path.isdir(fldr):
            dst = os.path.join(fldr, "PythonWin.lnk")
            create_shortcut(
                os.path.join(lib_dir, "Pythonwin\\Pythonwin.exe"),
                "The Pythonwin IDE",
                dst,
                "",
                sys.prefix,
            )
            if verbose:
                print("Shortcut for Pythonwin created")
            # And the docs.
            if chm_file:
                dst = os.path.join(fldr, "Python for Windows Documentation.lnk")
                doc = "Documentation for the PyWin32 extensions"
                create_shortcut(chm_file, doc, dst)
                if verbose:
                    print("Shortcut to documentation created")
        else:
            if verbose:
                print(f"Can't install shortcuts - {fldr!r} is not a folder")
    except Exception as details:
        print(details)

    # importing win32com.client ensures the gen_py dir created - not strictly
    # necessary to do now, but this makes the installation "complete"
    try:
        import win32com.client  # noqa
    except ImportError:
        # Don't let this error sound fatal
        pass
    print("The pywin32 extensions were successfully installed.")


def uninstall(lib_dir):
    """Undo what install() did; each failing step is reported and skipped."""
    # First ensure our system modules are loaded from pywin32_system, so
    # we can remove the ones we copied...
    LoadSystemModule(lib_dir, "pywintypes")
    LoadSystemModule(lib_dir, "pythoncom")

    try:
        RegisterCOMObjects(False)
    except Exception as why:
        print(f"Failed to unregister COM objects: {why}")

    try:
        RegisterHelpFile(False, lib_dir)
    except Exception as why:
        print(f"Failed to unregister help file: {why}")
    else:
        if verbose:
            print("Unregistered help file")

    try:
        RegisterPythonwin(False, lib_dir)
    except Exception as why:
        print(f"Failed to unregister Pythonwin: {why}")
    else:
        if verbose:
            print("Unregistered Pythonwin")

    try:
        # remove gen_py directory.
        gen_dir = os.path.join(lib_dir, "win32com", "gen_py")
        if os.path.isdir(gen_dir):
            shutil.rmtree(gen_dir)
            if verbose:
                print(f"Removed directory {gen_dir}")

        # Remove pythonwin compiled "config" files.
        pywin_dir = os.path.join(lib_dir, "Pythonwin", "pywin")
        for fname in glob.glob(os.path.join(pywin_dir, "*.cfc")):
            os.remove(fname)

        # The dbi.pyd.old files we may have created.
        try:
            os.remove(os.path.join(lib_dir, "win32", "dbi.pyd.old"))
        except OSError:
            pass
        try:
            os.remove(os.path.join(lib_dir, "win32", "dbi_d.pyd.old"))
        except OSError:
            pass

    except Exception as why:
        print(f"Failed to remove misc files: {why}")

    try:
        fldr = get_shortcuts_folder()
        for link in ("PythonWin.lnk", "Python for Windows Documentation.lnk"):
            fqlink = os.path.join(fldr, link)
            if os.path.isfile(fqlink):
                os.remove(fqlink)
                if verbose:
                    print(f"Removed {link}")
    except Exception as why:
        print(f"Failed to remove shortcuts: {why}")
    # Now remove the system32 files.
    files = glob.glob(os.path.join(lib_dir, "pywin32_system32\\*.*"))
    # Try the system32 directory first - if that fails due to "access denied",
    # it implies a non-admin user, and we use sys.prefix
    try:
        for dest_dir in [get_system_dir(), sys.prefix]:
            # and remove the files we copied there
            worked = 0
            for fname in files:
                base = os.path.basename(fname)
                dst = os.path.join(dest_dir, base)
                if os.path.isfile(dst):
                    try:
                        os.remove(dst)
                        worked = 1
                        if verbose:
                            print(f"Removed file {dst}")
                    except Exception:
                        print(f"FAILED to remove {dst}")
            if worked:
                break
    except Exception as why:
        print(f"FAILED to remove system files: {why}")


# NOTE: This used to be run from inside the bdist_wininst created binary un/installer.
# From inside the binary installer this script HAD to NOT
# call sys.exit() or raise SystemExit, otherwise the installer would also terminate!
# Out of principle, we're still not using system exits.


def verify_destination(location: str) -> str:
    """argparse ``type=`` callback: return *location* as an absolute path.

    Raises argparse.ArgumentTypeError when it is not an existing directory.
    """
    location = os.path.abspath(location)
    if not os.path.isdir(location):
        raise argparse.ArgumentTypeError(
            f'Path "{location}" is not an existing directory!'
        )
    return location


def main():
    """Parse the command line and run install() or uninstall()."""
    # The helpers above (CopyTo, SetPyKeyVal, install, ...) read these
    # module-level flags. Without this declaration the assignments below
    # would only create locals and -silent / -quiet would have no effect.
    global silent, verbose

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""A post-install script for the pywin32 extensions.

    * Typical usage:

    > python -m pywin32_postinstall -install

    * or (shorter but you don't have control over which python environment is used)

    > pywin32_postinstall -install

    You need to execute this script, with a '-install' parameter,
    to ensure the environment is setup correctly to install COM objects, services, etc.
    """,
    )
    parser.add_argument(
        "-install",
        default=False,
        action="store_true",
        help="Configure the Python environment correctly for pywin32.",
    )
    parser.add_argument(
        "-remove",
        default=False,
        action="store_true",
        help="Try and remove everything that was installed or copied.",
    )
    parser.add_argument(
        "-wait",
        type=int,
        help="Wait for the specified process to terminate before starting.",
    )
    parser.add_argument(
        "-silent",
        default=False,
        action="store_true",
        help='Don\'t display the "Abort/Retry/Ignore" dialog for files in use.',
    )
    parser.add_argument(
        "-quiet",
        default=False,
        action="store_true",
        help="Don't display progress messages.",
    )
    parser.add_argument(
        "-destination",
        default=sysconfig.get_paths()["platlib"],
        type=verify_destination,
        help="Location of the PyWin32 installation",
    )

    args = parser.parse_args()

    if not args.quiet:
        print(f"Parsed arguments are: {args}")

    # Exactly one of -install / -remove must be given.
    if not args.install ^ args.remove:
        parser.error("You need to either choose to -install or -remove!")

    if args.wait is not None:
        try:
            os.waitpid(args.wait, 0)
        except OSError:
            # child already dead
            pass

    silent = args.silent
    verbose = not args.quiet

    if args.install:
        install(args.destination)

    if args.remove:
        uninstall(args.destination)


if __name__ == "__main__":
    main()

# diff --git a/.venv2/Scripts/pywin32_testall.exe b/.venv2/Scripts/pywin32_testall.exe new file mode 100644 index 0000000..381c7ca
# Binary files /dev/null and b/.venv2/Scripts/pywin32_testall.exe differ
# diff --git a/.venv2/Scripts/pywin32_testall.py b/.venv2/Scripts/pywin32_testall.py new file mode 100644 index 0000000..0880a1d
# --- /dev/null
# +++ b/.venv2/Scripts/pywin32_testall.py
# @@ -0,0 +1,120 @@
"""A test runner for pywin32"""

import os
import site
import subprocess
import sys

# locate the dirs based on where this script is - it may be either in the
# source tree, or in an installed Python 'Scripts' tree.
project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
site_packages = [site.getusersitepackages()] + site.getsitepackages()

# Scripts that exited with a non-zero return code, in the order they ran.
failures = []


# Run a test using subprocess and wait for the result.
# If we get an returncode != 0, we know that there was an error, but we don't
# abort immediately - we run as many tests as we can.
def run_test(script, cmdline_extras):
    """Run *script* with ``sys.executable`` and record it in ``failures`` on error.

    *cmdline_extras* is any iterable of extra command-line arguments.
    """
    dirname, scriptname = os.path.split(script)
    # some tests prefer to be run from their directory.
    cmd = [sys.executable, "-u", scriptname, *cmdline_extras]
    print(f"--- Running '{script}' ---")
    sys.stdout.flush()
    # A bare file name splits to dirname == "", which is not a valid cwd;
    # None means "stay in the current directory".
    result = subprocess.run(cmd, check=False, cwd=dirname or None)
    print(f"*** Test script '{script}' exited with {result.returncode}")
    sys.stdout.flush()
    if result.returncode:
        failures.append(script)


def find_and_run(possible_locations, extras):
    """Run the first existing file in *possible_locations* via run_test().

    Raises RuntimeError when none of the candidates exists.
    """
    for maybe in possible_locations:
        if os.path.isfile(maybe):
            run_test(maybe, extras)
            break
    else:
        # An f-string formats any sequence type; "%s" % seq breaks on tuples.
        raise RuntimeError(
            f"Failed to locate a test script in one of {possible_locations}"
        )


def main():
    """Run the win32, Pythonwin, win32com and (optionally) adodbapi test suites.

    Exits with status 1 when any test script failed.
    """
    import argparse

    code_directories = [project_root] + site_packages

    parser = argparse.ArgumentParser(
        description="A script to trigger tests in all subprojects of PyWin32."
    )
    parser.add_argument(
        "-no-user-interaction",
        default=False,
        action="store_true",
        help="(This is now the default - use `-user-interaction` to include them)",
    )

    parser.add_argument(
        "-user-interaction",
        action="store_true",
        help="Include tests which require user interaction",
    )

    parser.add_argument(
        "-skip-adodbapi",
        default=False,
        action="store_true",
        help="Skip the adodbapi tests; useful for CI where there's no provider",
    )

    # Unrecognised arguments are forwarded to the test scripts.
    args, remains = parser.parse_known_args()

    # win32, win32ui / Pythonwin

    extras = []
    if args.user_interaction:
        extras.append("-user-interaction")
    extras.extend(remains)
    scripts = [
        "win32/test/testall.py",
        "Pythonwin/pywin/test/all.py",
    ]
    for script in scripts:
        maybes = [os.path.join(directory, script) for directory in code_directories]
        find_and_run(maybes, extras)

    # win32com
    maybes = [
        os.path.join(directory, "win32com", "test", "testall.py")
        for directory in [os.path.join(project_root, "com")] + site_packages
    ]
    extras = remains + ["1"]  # only run "level 1" tests in CI
    find_and_run(maybes, extras)

    # adodbapi
    if not args.skip_adodbapi:
        maybes = [
            os.path.join(directory, "adodbapi", "test", "adodbapitest.py")
            for directory in code_directories
        ]
        find_and_run(maybes, remains)
        # This script has a hard-coded sql server name in it, (and markh typically
        # doesn't have a different server to test on) but there is now supposed to be a server out there on the Internet
        # just to run these tests, so try it...
        maybes = [
            os.path.join(directory, "adodbapi", "test", "test_adodbapi_dbapi20.py")
            for directory in code_directories
        ]
        find_and_run(maybes, remains)

    if failures:
        print("The following scripts failed")
        for failure in failures:
            print(">", failure)
        sys.exit(1)
    print("All tests passed \\o/")


if __name__ == "__main__":
    main()

# diff --git a/.venv2/Scripts/ruff.exe b/.venv2/Scripts/ruff.exe new file mode 100644 index 0000000..4d63508 Binary files /dev/null and b/.venv2/Scripts/ruff.exe differ
# diff --git a/.venv2/Scripts/tiny-agents.exe b/.venv2/Scripts/tiny-agents.exe new file mode 100644 index 0000000..2417f3a Binary files /dev/null and b/.venv2/Scripts/tiny-agents.exe differ
# diff --git a/.venv2/Scripts/tqdm.exe b/.venv2/Scripts/tqdm.exe new file mode 100644 index 0000000..2e50f3b Binary files /dev/null and b/.venv2/Scripts/tqdm.exe differ
# diff --git a/.venv2/Scripts/ttx.exe b/.venv2/Scripts/ttx.exe new file mode 100644 index 0000000..a8b2691 Binary files /dev/null and b/.venv2/Scripts/ttx.exe differ
# diff --git a/.venv2/Scripts/typer.exe b/.venv2/Scripts/typer.exe new file mode 100644 index 0000000..d82c9a9 Binary files /dev/null and b/.venv2/Scripts/typer.exe differ
# diff --git a/.venv2/Scripts/upload_theme.exe b/.venv2/Scripts/upload_theme.exe new file mode 100644 index 0000000..1a95626 Binary files /dev/null and b/.venv2/Scripts/upload_theme.exe differ
# diff --git a/.venv2/Scripts/uvicorn.exe b/.venv2/Scripts/uvicorn.exe new file mode 100644 index 0000000..0839f1e Binary files /dev/null and b/.venv2/Scripts/uvicorn.exe differ
# diff --git a/.venv2/Scripts/watchfiles.exe b/.venv2/Scripts/watchfiles.exe new file mode 100644 index 0000000..a34e57e Binary files /dev/null and b/.venv2/Scripts/watchfiles.exe differ
# diff --git a/.venv2/Scripts/websockets.exe b/.venv2/Scripts/websockets.exe new file mode 100644 index 0000000..5993bc8 Binary files /dev/null and b/.venv2/Scripts/websockets.exe differ
# diff --git a/.venv2/Scripts/wsdump.exe
b/.venv2/Scripts/wsdump.exe new file mode 100644 index 0000000..e959211 Binary files /dev/null and b/.venv2/Scripts/wsdump.exe differ diff --git a/.venv2/share/man/man1/ipython.1 b/.venv2/share/man/man1/ipython.1 new file mode 100644 index 0000000..0f4a191 --- /dev/null +++ b/.venv2/share/man/man1/ipython.1 @@ -0,0 +1,60 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH IPYTHON 1 "July 15, 2011" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) and groff_man(7) +.\" .SH section heading +.\" .SS secondary section heading +.\" +.\" +.\" To preview this page as plain text: nroff -man ipython.1 +.\" +.SH NAME +ipython \- Tools for Interactive Computing in Python. +.SH SYNOPSIS +.B ipython +.RI [ options ] " files" ... + +.B ipython subcommand +.RI [ options ] ... + +.SH DESCRIPTION +An interactive Python shell with automatic history (input and output), dynamic +object introspection, easier configuration, command completion, access to the +system shell, integration with numerical and scientific computing tools, +web notebook, Qt console, and more. + +For more information on how to use IPython, see 'ipython \-\-help', +or 'ipython \-\-help\-all' for all available command\(hyline options. + +.SH "ENVIRONMENT VARIABLES" +.sp +.PP +\fIIPYTHONDIR\fR +.RS 4 +This is the location where IPython stores all its configuration files. The default +is $HOME/.ipython if IPYTHONDIR is not defined. + +You can see the computed value of IPYTHONDIR with `ipython locate`. 
+ +.SH FILES + +IPython uses various configuration files stored in profiles within IPYTHONDIR. +To generate the default configuration files and start configuring IPython, +do 'ipython profile create', and edit '*_config.py' files located in +IPYTHONDIR/profile_default. + +.SH AUTHORS +IPython is written by the IPython Development Team . diff --git a/.venv2/share/man/man1/isympy.1 b/.venv2/share/man/man1/isympy.1 new file mode 100644 index 0000000..0ff9661 --- /dev/null +++ b/.venv2/share/man/man1/isympy.1 @@ -0,0 +1,188 @@ +'\" -*- coding: us-ascii -*- +.if \n(.g .ds T< \\FC +.if \n(.g .ds T> \\F[\n[.fam]] +.de URL +\\$2 \(la\\$1\(ra\\$3 +.. +.if \n(.g .mso www.tmac +.TH isympy 1 2007-10-8 "" "" +.SH NAME +isympy \- interactive shell for SymPy +.SH SYNOPSIS +'nh +.fi +.ad l +\fBisympy\fR \kx +.if (\nx>(\n(.l/2)) .nr x (\n(.l/5) +'in \n(.iu+\nxu +[\fB-c\fR | \fB--console\fR] [\fB-p\fR ENCODING | \fB--pretty\fR ENCODING] [\fB-t\fR TYPE | \fB--types\fR TYPE] [\fB-o\fR ORDER | \fB--order\fR ORDER] [\fB-q\fR | \fB--quiet\fR] [\fB-d\fR | \fB--doctest\fR] [\fB-C\fR | \fB--no-cache\fR] [\fB-a\fR | \fB--auto\fR] [\fB-D\fR | \fB--debug\fR] [ +-- | PYTHONOPTIONS] +'in \n(.iu-\nxu +.ad b +'hy +'nh +.fi +.ad l +\fBisympy\fR \kx +.if (\nx>(\n(.l/2)) .nr x (\n(.l/5) +'in \n(.iu+\nxu +[ +{\fB-h\fR | \fB--help\fR} +| +{\fB-v\fR | \fB--version\fR} +] +'in \n(.iu-\nxu +.ad b +'hy +.SH DESCRIPTION +isympy is a Python shell for SymPy. It is just a normal python shell +(ipython shell if you have the ipython package installed) that executes +the following commands so that you don't have to: +.PP +.nf +\*(T< +>>> from __future__ import division +>>> from sympy import * +>>> x, y, z = symbols("x,y,z") +>>> k, m, n = symbols("k,m,n", integer=True) + \*(T> +.fi +.PP +So starting isympy is equivalent to starting python (or ipython) and +executing the above commands by hand. It is intended for easy and quick +experimentation with SymPy. 
For more complicated programs, it is recommended +to write a script and import things explicitly (using the "from sympy +import sin, log, Symbol, ..." idiom). +.SH OPTIONS +.TP +\*(T<\fB\-c \fR\*(T>\fISHELL\fR, \*(T<\fB\-\-console=\fR\*(T>\fISHELL\fR +Use the specified shell (python or ipython) as +console backend instead of the default one (ipython +if present or python otherwise). + +Example: isympy -c python + +\fISHELL\fR could be either +\&'ipython' or 'python' +.TP +\*(T<\fB\-p \fR\*(T>\fIENCODING\fR, \*(T<\fB\-\-pretty=\fR\*(T>\fIENCODING\fR +Setup pretty printing in SymPy. By default, the most pretty, unicode +printing is enabled (if the terminal supports it). You can use less +pretty ASCII printing instead or no pretty printing at all. + +Example: isympy -p no + +\fIENCODING\fR must be one of 'unicode', +\&'ascii' or 'no'. +.TP +\*(T<\fB\-t \fR\*(T>\fITYPE\fR, \*(T<\fB\-\-types=\fR\*(T>\fITYPE\fR +Setup the ground types for the polys. By default, gmpy ground types +are used if gmpy2 or gmpy is installed, otherwise it falls back to python +ground types, which are a little bit slower. You can manually +choose python ground types even if gmpy is installed (e.g., for testing purposes). + +Note that sympy ground types are not supported, and should be used +only for experimental purposes. + +Note that the gmpy1 ground type is primarily intended for testing; it forces the +use of gmpy even if gmpy2 is available. + +This is the same as setting the environment variable +SYMPY_GROUND_TYPES to the given ground type (e.g., +SYMPY_GROUND_TYPES='gmpy'). + +The ground types can be determined interactively from the variable +sympy.polys.domains.GROUND_TYPES inside the isympy shell itself. + +Example: isympy -t python + +\fITYPE\fR must be one of 'gmpy', +\&'gmpy1' or 'python'. +.TP +\*(T<\fB\-o \fR\*(T>\fIORDER\fR, \*(T<\fB\-\-order=\fR\*(T>\fIORDER\fR +Setup the ordering of terms for printing. The default is lex, which +orders terms lexicographically (e.g., x**2 + x + 1).
You can choose +other orderings, such as rev-lex, which will use reverse +lexicographic ordering (e.g., 1 + x + x**2). + +Note that for very large expressions, ORDER='none' may speed up +printing considerably, with the tradeoff that the order of the terms +in the printed expression will have no canonical order. + +Example: isympy -o rev-lex + +\fIORDER\fR must be one of 'lex', 'rev-lex', 'grlex', +\&'rev-grlex', 'grevlex', 'rev-grevlex', 'old', or 'none'. +.TP +\*(T<\fB\-q\fR\*(T>, \*(T<\fB\-\-quiet\fR\*(T> +Print only Python's and SymPy's versions to stdout at startup, and nothing else. +.TP +\*(T<\fB\-d\fR\*(T>, \*(T<\fB\-\-doctest\fR\*(T> +Use the same format that should be used for doctests. This is +equivalent to '\fIisympy -c python -p no\fR'. +.TP +\*(T<\fB\-C\fR\*(T>, \*(T<\fB\-\-no\-cache\fR\*(T> +Disable the caching mechanism. Disabling the cache may slow certain +operations down considerably. This is useful for testing the cache, +or for benchmarking, as the cache can result in deceptive benchmark timings. + +This is the same as setting the environment variable SYMPY_USE_CACHE +to 'no'. +.TP +\*(T<\fB\-a\fR\*(T>, \*(T<\fB\-\-auto\fR\*(T> +Automatically create missing symbols. Normally, typing a name of a +Symbol that has not been instantiated first would raise NameError, +but with this option enabled, any undefined name will be +automatically created as a Symbol. This only works in IPython 0.11. + +Note that this is intended only for interactive, calculator style +usage. In a script that uses SymPy, Symbols should be instantiated +at the top, so that it's clear what they are. + +This will not override any names that are already defined, which +includes the single character letters represented by the mnemonic +QCOSINE (see the "Gotchas and Pitfalls" document in the +documentation). You can delete existing names by executing "del +name" in the shell itself. You can see if a name is defined by typing +"'name' in globals()".
+ +The Symbols that are created using this have default assumptions. +If you want to place assumptions on symbols, you should create them +using symbols() or var(). + +Finally, this only works in the top level namespace. So, for +example, if you define a function in isympy with an undefined +Symbol, it will not work. +.TP +\*(T<\fB\-D\fR\*(T>, \*(T<\fB\-\-debug\fR\*(T> +Enable debugging output. This is the same as setting the +environment variable SYMPY_DEBUG to 'True'. The debug status is set +in the variable SYMPY_DEBUG within isympy. +.TP +-- \fIPYTHONOPTIONS\fR +These options will be passed on to \fIipython (1)\fR shell. +Only supported when ipython is being used (standard python shell not supported). + +Two dashes (--) are required to separate \fIPYTHONOPTIONS\fR +from the other isympy options. + +For example, to run iSymPy without startup banner and colors: + +isympy -q -c ipython -- --colors=NoColor +.TP +\*(T<\fB\-h\fR\*(T>, \*(T<\fB\-\-help\fR\*(T> +Print help output and exit. +.TP +\*(T<\fB\-v\fR\*(T>, \*(T<\fB\-\-version\fR\*(T> +Print isympy version information and exit. +.SH FILES +.TP +\*(T<\fI${HOME}/.sympy\-history\fR\*(T> +Saves the history of commands when using the python +shell as backend. +.SH BUGS +The upstream BTS can be found at \(lahttps://github.com/sympy/sympy/issues\(ra +Please report all bugs that you find there; this will help improve +the overall quality of SymPy. +.SH "SEE ALSO" +\fBipython\fR(1), \fBpython\fR(1) diff --git a/.venv2/share/man/man1/ttx.1 b/.venv2/share/man/man1/ttx.1 new file mode 100644 index 0000000..9a65edf --- /dev/null +++ b/.venv2/share/man/man1/ttx.1 @@ -0,0 +1,225 @@ +.Dd May 18, 2004 +.\" ttx is not specific to any OS, but contrary to what groff_mdoc(7) +.\" seems to imply, entirely omitting the .Os macro causes 'BSD' to +.\" be used, so I give a zero-width space as its argument. +.Os \& +.\" The "FontTools Manual" argument apparently has no effect in +.\" groff 1.18.1. 
I think it is a bug in the -mdoc groff package. +.Dt TTX 1 "FontTools Manual" +.Sh NAME +.Nm ttx +.Nd tool for manipulating TrueType and OpenType fonts +.Sh SYNOPSIS +.Nm +.Bk +.Op Ar option ... +.Ek +.Bk +.Ar file ... +.Ek +.Sh DESCRIPTION +.Nm +is a tool for manipulating TrueType and OpenType fonts. It can convert +TrueType and OpenType fonts to and from an +.Tn XML Ns -based format called +.Tn TTX . +.Tn TTX +files have a +.Ql .ttx +extension. +.Pp +For each +.Ar file +argument it is given, +.Nm +detects whether it is a +.Ql .ttf , +.Ql .otf +or +.Ql .ttx +file and acts accordingly: if it is a +.Ql .ttf +or +.Ql .otf +file, it generates a +.Ql .ttx +file; if it is a +.Ql .ttx +file, it generates a +.Ql .ttf +or +.Ql .otf +file. +.Pp +By default, every output file is created in the same directory as the +corresponding input file and with the same name except for the +extension, which is substituted appropriately. +.Nm +never overwrites existing files; if necessary, it appends a suffix to +the output file name before the extension, as in +.Pa Arial#1.ttf . +.Ss "General options" +.Bl -tag -width ".Fl t Ar table" +.It Fl h +Display usage information. +.It Fl d Ar dir +Write the output files to directory +.Ar dir +instead of writing every output file to the same directory as the +corresponding input file. +.It Fl o Ar file +Write the output to +.Ar file +instead of writing it to the same directory as the +corresponding input file. +.It Fl v +Be verbose. Write more messages to the standard output describing what +is being done. +.It Fl a +Allow virtual glyph IDs on compile or decompile. +.El +.Ss "Dump options" +The following options control the process of dumping font files +(TrueType or OpenType) to +.Tn TTX +files. +.Bl -tag -width ".Fl t Ar table" +.It Fl l +List table information. Instead of dumping the font to a +.Tn TTX +file, display minimal information about each table. +.It Fl t Ar table +Dump table +.Ar table . 
+This option may be given multiple times to dump several tables at +once. When not specified, all tables are dumped. +.It Fl x Ar table +Exclude table +.Ar table +from the list of tables to dump. This option may be given multiple +times to exclude several tables from the dump. The +.Fl t +and +.Fl x +options are mutually exclusive. +.It Fl s +Split tables. Dump each table to a separate +.Tn TTX +file and write (under the name that would have been used for the output +file if the +.Fl s +option had not been given) one small +.Tn TTX +file containing references to the individual table dump files. This +file can be used as input to +.Nm +as long as the referenced files can be found in the same directory. +.It Fl i +.\" XXX: I suppose OpenType programs (exist and) are also affected. +Don't disassemble TrueType instructions. When this option is specified, +all TrueType programs (glyph programs, the font program and the +pre-program) are written to the +.Tn TTX +file as hexadecimal data instead of +assembly. This saves some time and results in smaller +.Tn TTX +files. +.It Fl y Ar n +When decompiling a TrueType Collection (TTC) file, +decompile font number +.Ar n , +starting from 0. +.El +.Ss "Compilation options" +The following options control the process of compiling +.Tn TTX +files into font files (TrueType or OpenType): +.Bl -tag -width ".Fl t Ar table" +.It Fl m Ar fontfile +Merge the input +.Tn TTX +file +.Ar file +with +.Ar fontfile . +No more than one +.Ar file +argument can be specified when this option is used. +.It Fl b +Don't recalculate glyph bounding boxes. Use the values in the +.Tn TTX +file as is. +.El +.Sh "THE TTX FILE FORMAT" +You can find some information about the +.Tn TTX +file format in +.Pa documentation.html . +In particular, you will find in that file the list of tables understood by +.Nm +and the relations between TrueType GlyphIDs and the glyph names used in +.Tn TTX +files. 
+.Sh EXAMPLES +In the following examples, all files are read from and written to the +current directory. Additionally, the name given for the output file +assumes in every case that it did not exist before +.Nm +was invoked. +.Pp +Dump the TrueType font contained in +.Pa FreeSans.ttf +to +.Pa FreeSans.ttx : +.Pp +.Dl ttx FreeSans.ttf +.Pp +Compile +.Pa MyFont.ttx +into a TrueType or OpenType font file: +.Pp +.Dl ttx MyFont.ttx +.Pp +List the tables in +.Pa FreeSans.ttf +along with some information: +.Pp +.Dl ttx -l FreeSans.ttf +.Pp +Dump the +.Sq cmap +table from +.Pa FreeSans.ttf +to +.Pa FreeSans.ttx : +.Pp +.Dl ttx -t cmap FreeSans.ttf +.Sh NOTES +On MS\-Windows and MacOS, +.Nm +is available as a graphical application to which files can be dropped. +.Sh SEE ALSO +.Pa documentation.html +.Pp +.Xr fontforge 1 , +.Xr ftinfo 1 , +.Xr gfontview 1 , +.Xr xmbdfed 1 , +.Xr Font::TTF 3pm +.Sh AUTHORS +.Nm +was written by +.An -nosplit +.An "Just van Rossum" Aq just@letterror.com . +.Pp +This manual page was written by +.An "Florent Rougon" Aq f.rougon@free.fr +for the Debian GNU/Linux system based on the existing FontTools +documentation. It may be freely used, modified and distributed without +restrictions. +.\" For Emacs: +.\" Local Variables: +.\" fill-column: 72 +.\" sentence-end: "[.?!][]\"')}]*\\($\\| $\\| \\| \\)[ \n]*" +.\" sentence-end-double-space: t +.\" End: \ No newline at end of file diff --git a/MEETING_PULL_SUMMARY_2026-03-25.md b/MEETING_PULL_SUMMARY_2026-03-25.md new file mode 100644 index 0000000..16d548c --- /dev/null +++ b/MEETING_PULL_SUMMARY_2026-03-25.md @@ -0,0 +1,171 @@ +# Pull Discussion Summary + +Comparison basis: +- Local working tree on `main` at `bf2d9e2` +- Upstream `origin/main` at `d4a40be` +- Date: 2026-03-25 + +This note summarizes the main code changes relative to the current upstream `main`, why they were made, and what still needs discussion before deciding whether to pull this work. 
+ +## What Changed at a High Level + +- The codebase was moved from mostly global storage assumptions to a project-aware model. +- Ingestion, retrieval, logs, graph snapshots, and audit outputs can now be scoped to a selected document folder. +- The Gradio app was updated to work with that project-aware model instead of assuming one global storage location. +- Graph pickle support was restored and split into two roles: + - `kg.pkl` for the editable working graph in the UI + - `kg_retrieval.pkl` for the retrieval snapshot built from canonical storage +- Structured JSON logging was added for question-answer interactions. +- Extraction was extended to use language information from documents and chunks. +- Clean-ingestion stability on this Windows/Python 3.12 environment was improved by keeping retrieval operational even when native Chroma vector operations are unstable. + +## Project-Aware Ingestion and Storage + +The main architectural change is that a selected document folder is now treated as a project root for artifacts. + +Indented example: + +```text +C:\path\to\documents\ + report.pdf + notes.txt + .appl-kgraph\ + storage\ + documents.sqlite + chunks.sqlite + graph.sqlite + chroma_chunks\ + chroma_entities\ + chroma_relations\ + knowledge_graph\ + kg.pkl + kg_retrieval.pkl + logs\ + ingestion.log + pathrag.log + lightrag.log + qa\ + 20260325T....json + audits\ + extraction\ + report.audit.json +``` + +Why this matters: +- Separate corpora no longer have to share one global storage directory. +- The UI can later select a project cleanly because the storage layout already supports it. +- Logs, graph artifacts, and audits stay attached to the document set that produced them. + +## Most Important Code Changes and Reasons + +- `graph/project_paths.py` + - Added a single resolver for turning a document folder into project-local storage, logs, audits, and knowledge-graph paths. + - This is the main seam that makes per-project isolation work. 
+ +- `graph/ingestion.py` + - Ingestion now accepts a project/document root and writes to project-local storage. + - Added structured progress messages for the UI during ingestion. + - Added retrieval-snapshot writing after ingestion. + - Added hooks for audit output and more explicit per-file processing stages. + +- `graph/app.py` + - The Gradio app now follows the project-aware storage model. + - Ingestion status is streamed live in the sidebar instead of only showing a final result. + - The knowledge graph rendering was fixed to use an iframe wrapper so the PyVis graph actually displays. + - The earlier graph editing tools were restored: + - save working graph + - load saved graph + - edit node + - merge nodes + - The app now distinguishes between: + - the editable working graph pickle + - the retrieval graph snapshot + +- `graph/graph_pickle.py` + - Added helper functions for loading, saving, and rebuilding graph pickles from canonical storage. + - This keeps pickle logic out of the UI and retriever code. + +- `graph/pathrag.py` and `graph/lightrag.py` + - Retrieval now loads the graph from the project-specific retrieval snapshot when available. + - Both retrievers now log question-answer interactions to structured JSON files. + - Retrieval was adjusted to work with the project-aware storage model instead of a single global storage root. + +- `graph/query_logging.py` + - Added JSON query logging for QA sessions. + - This is useful for debugging, auditability, and reviewing retrieval context after the fact. + +- `graph/fileparser.py` + - Text-file parsing was changed to avoid depending on `chardet` for normal `.txt` ingestion. + - This fixed a real failure seen while ingesting `docs2\UDHR_first_article_all.txt`. + +- `graph/extractor.py`, `graph/prompts.py`, and `graph/utils.py` + - Extraction prompting now has better language handling and cleaner prompt wiring. + - Utility and prompt behavior is more centralized than before. 
+ +- `graph/db_storage.py` + - Storage was extended to support project-aware paths cleanly. + - A SQL-backed similarity fallback was added for environments where native Chroma vector operations are unstable. + - On the current Windows/Python 3.12 setup, that fallback is now also used to avoid clean-ingestion crashes during Chroma vector writes. + +- `graph/chunker.py` + - Overlap behavior was tightened so it does not keep compounding prior overlap unintentionally. + +- `graph/settings.py` + - Settings were expanded for project layout, logging split, and extraction toggles. + - This is one of the larger refactors in the diff and is functionally useful, but also one of the noisier files to review. + +- `graph/logging_utils.py` + - Added centralized file logger setup so ingestion and retrieval logs can be scoped per project. + +- Tests + - Added coverage for project-path resolution, graph pickle helpers, and chunk overlap behavior. + - Repaired the existing PathRAG storage adapter test. + +## New Files Added + +- `graph/project_paths.py` +- `graph/query_logging.py` +- `graph/graph_pickle.py` +- `graph/logging_utils.py` +- `test/test_project_paths.py` +- `test/test_graph_pickle.py` +- `test/test_chunker.py` + +## Why the Changes Are Good + +- The system is easier to reason about because project selection now maps directly to storage layout. +- The UI and backend are more aligned than before. +- Retrieval and ingestion leave behind much better operational traces through logs, QA JSON files, and graph snapshots. +- The graph editing workflow was preserved instead of being lost during the project-scoping changes. +- The codebase is better positioned for future UI project selection without another storage redesign. + +## Open Issues and Discussion Points + +- Native Chroma stability on Windows/Python 3.12 + - On this machine, native Chroma query and native Chroma vector writes both proved unstable. 
+ - The current solution keeps the app working by relying on SQL-backed similarity instead. + - This is practical, but it is still a workaround and should be discussed explicitly. + +- Performance implications of the fallback + - The SQL-backed similarity path is slower than native Chroma ANN search on larger corpora. + - For current functionality it works, but it is not the ideal long-term path if Chroma can be stabilized. + +- Duplicate basename handling during ingestion + - Some ingestion logic still keys documents by filename rather than full filepath. + - Two files with the same name in different subfolders of one project can still collide. + +- Second-pass extraction behavior + - There is audit support, but second-pass findings are still not cleanly positioned as graph-augmenting extraction. + - This should be clarified before calling the extraction flow final. + +- Working graph vs canonical graph + - `kg.pkl` is still a working UI artifact. + - UI graph edits do not currently sync back into canonical retrieval storage. + - That separation is intentional, but it should be understood by everyone discussing the pull. + +- Size and reviewability of the diff + - `app.py`, `settings.py`, `extractor.py`, and `db_storage.py` carry a large share of the churn. + - Functionally, many of the changes are reasonable. + - Review-wise, this may still be easier to land if split into smaller pull requests. + +# diff --git a/README.md b/README.md index 3a6ba9a..a8740dd 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # appl-kgraph +This project is being developed with **Python 3.12** as its intended local environment and runtime baseline. + **appl-kgraph** is a modular **graph-based Retrieval-Augmented Generation (RAG)** system with pluggable retrieval strategies. It is designed for querying large document collections by combining classic vector-based retrieval with an explicit **knowledge graph** over entities and relations extracted from the source texts. 
The project builds on ideas from recent graph-based RAG research, most notably **LightRAG** and **PathRAG**, and provides a shared graph + vector indexing layer on top of which multiple retrieval strategies can be implemented, compared, and extended. @@ -129,18 +131,20 @@ If your document collection includes `.docx` files, ensure that **Microsoft Word 1. Open a terminal (Anaconda Prompt or standard shell) 2. Navigate to the project root -3. Create a virtual environment: +3. Verify that your active interpreter is Python 3.12 before installing dependencies. +4. Create a virtual environment with Python 3.12: + ```text + Windows: py -3.12 -m venv venv + macOS/Linux: python3.12 -m venv venv ``` - python -m venv venv - ``` -4. Activate it: +5. Activate it: * Windows: `venv\Scripts\activate` * macOS/Linux: `source venv/bin/activate` -5. Install dependencies: +6. Install dependencies: - ``` + ```text pip install -r requirements.txt ``` @@ -150,8 +154,9 @@ If your document collection includes `.docx` files, ensure that **Microsoft Word From the project root, run: -``` -python graph/main.py +```text +Windows: py -3.12 graph/main.py +macOS/Linux: python3.12 graph/main.py ``` This will execute the current end-to-end pipeline using the configured retrieval strategy. 
@@ -177,5 +182,3 @@ For questions, feedback, or collaboration inquiries, you can contact the maintai 📧 Contact link - - diff --git a/docs/.appl-kgraph/knowledge_graph/kg.pkl b/docs/.appl-kgraph/knowledge_graph/kg.pkl new file mode 100644 index 0000000..42fb95d Binary files /dev/null and b/docs/.appl-kgraph/knowledge_graph/kg.pkl differ diff --git a/docs/.appl-kgraph/knowledge_graph/kg_retrieval.pkl b/docs/.appl-kgraph/knowledge_graph/kg_retrieval.pkl new file mode 100644 index 0000000..3b534f5 Binary files /dev/null and b/docs/.appl-kgraph/knowledge_graph/kg_retrieval.pkl differ diff --git a/docs2/.appl-kgraph/knowledge_graph/kg.pkl b/docs2/.appl-kgraph/knowledge_graph/kg.pkl new file mode 100644 index 0000000..f9d2ee5 Binary files /dev/null and b/docs2/.appl-kgraph/knowledge_graph/kg.pkl differ diff --git a/docs2/.appl-kgraph/knowledge_graph/kg_retrieval.pkl b/docs2/.appl-kgraph/knowledge_graph/kg_retrieval.pkl new file mode 100644 index 0000000..e3da44d Binary files /dev/null and b/docs2/.appl-kgraph/knowledge_graph/kg_retrieval.pkl differ diff --git a/docs_store/UDHR_first_article_all.txt b/docs2/UDHR_first_article_all.txt similarity index 100% rename from docs_store/UDHR_first_article_all.txt rename to docs2/UDHR_first_article_all.txt diff --git a/docs2/meeting-change-summary-2026-03-18.pdf b/docs2/meeting-change-summary-2026-03-18.pdf new file mode 100644 index 0000000..b9be681 Binary files /dev/null and b/docs2/meeting-change-summary-2026-03-18.pdf differ diff --git a/graph/app.py b/graph/app.py index 46e54c9..c7ce506 100644 --- a/graph/app.py +++ b/graph/app.py @@ -1,713 +1,570 @@ -import asyncio +from __future__ import annotations + +import html +import queue +import tempfile +import threading +from pathlib import Path +from typing import Any, Iterator, List, Optional, Tuple + import gradio as gr +import matplotlib.colors as mcolors +import matplotlib.pyplot as plt import networkx as nx from pyvis.network import Network -import os, json, pickle, random, 
glob, tempfile, base64 -from typing import Optional, List, Tuple -import matplotlib.pyplot as plt -import matplotlib.colors as mcolors -from pathlib import Path -from win32api import GetSystemMetrics -from gradio import themes -# local imports -from fileparser import FileParser -from ingestion_app import ingest_paths -from pathrag import PathRAG, render_full_context -from lightrag import LightRAG, render_full_context -import settings + +from ingestion import ingest_paths +from lightrag import LightRAG +from pathrag import PathRAG, StorageAdapter as PathStorageAdapter +from graph_pickle import load_graph_from_pickle, save_graph_to_pickle +from project_paths import ( + ProjectPaths, + ensure_project_dirs, + list_document_paths, + resolve_project_paths, +) # ===================================================== # -------- INITIAL RANDOM GRAPH CREATION -------------- # ===================================================== + mygraph = nx.Graph() -# COLOR_MODE = "type" +_PATHRAG_CACHE: dict[str, PathRAG] = {} +_LIGHTRAG_CACHE: dict[str, LightRAG] = {} # ===================================================== # -------- DYNAMIC COLOR GENERATION ------------------- # ===================================================== def generate_dynamic_type_colors(graph): types = sorted(set(data.get("type", "Unknown") for _, data in graph.nodes(data=True))) + if not types: + return {} cmap = plt.get_cmap("tab20", len(types)) - return {t: mcolors.to_hex(cmap(i)) for i, t in enumerate(types)} + return {node_type: mcolors.to_hex(cmap(index)) for index, node_type in enumerate(types)} -SOURCE_COLORS = { - "User": "#17becf", "System": "#bcbd22", "External": "#e377c2", "Default": "#7f7f7f" -} - -# ===================================================== -# -------- GRAPH RENDERING ---------------------------- -# ===================================================== -# def render_graph_iframe(graph, color_mode="type", height_px=600): -def render_graph_iframe(graph, height_px=600): - TYPE_COLORS = 
generate_dynamic_type_colors(graph) - - # net = Network(height=f"{height_px}px", width="100%", directed=False, bgcolor="#111111", select_menu=True, filter_menu=True) - net = Network(height=f"{height_px}px", width="100%", directed=False, bgcolor="#111111") - # net.set_options(""" - # var options = { - # "nodes": {"font": {"color": "white","size":10,"face":"arial"}, "borderWidth": 2}, - # "edges": {"color":{"color":"#AAAAAA"}, "smooth": false} - # } - # """) - - # # Important: disable physics BEFORE adding the graph - # net.barnes_hut() # optional layout settings - # net.toggle_physics(False) - - # Load the graph directly (fast) - graph_path = os.path.join("knowledge_graph", "kg.pkl") - with open(graph_path, 'rb') as f: - graph = pickle.load(f) - net.from_nx(graph) - - # for node, data in graph.nodes(data=True): - # node_label = data.get("label", "Unknown") - # node_type = data.get("type", "Unknown") - # node_source = data.get("source", "Unknown") - # node_description = data.get("description", "Unknown") - # # if color_mode == "type": - # color = TYPE_COLORS.get(node_type, "#7f7f7f") - # # else: - # # color = SOURCE_COLORS.get(node_source, SOURCE_COLORS["Default"]) - # title = f"Label: {node_label}, Type: {node_type}, Source: {node_source}, Description: {node_description}" - # net.add_node(node, label=data.get("label", node), title=title, color=color) - - # for u, v, data in graph.edges(data=True): - # rel = data.get("relation_type", "relation") - # desc = data.get("relation_description", "") - # weight = data.get("weight", 1) - # net.add_edge(u, v, title=f"{rel}, Description: {desc}, Weight: {weight}", value=weight) - - net.set_options(""" - var options = { - "nodes": { - "borderWidth": 0, - "borderWidthSelected": 4, - "opacity": 1, - "fixed": { - "x": true, - "y": true - }, - "font": { - "strokeWidth": 10 - }, - "size": 14 - }, - "edges": { - "arrows": { - "middle": { - "enabled": true - } - }, - "selfReferenceSize": null, - "selfReference": { - "angle": 
0.7853981633974483 - }, - "smooth": { - "forceDirection": "none" - } - }, - "interaction": { - "hover": true, - "multiselect": true, - "navigationButtons": true - }, - "manipulation": { - "enabled": true, - "initiallyActive": true - }, - "physics": { - "enabled": false, - "minVelocity": 0.75 - } - } - """) - - # net.show_buttons(filter_=['physics']) - # net.show_buttons() - - # net.from_nx(graph) - # # Define the custom JS code for hover effect - # hover_code = """ - # var highlighted = []; - # var neighbors = []; - - # // Event for hovering over nodes - # network.on('hoverNode', function(params) { - # var node_id = params.node; - - # // Get the node data - # highlighted = []; - # neighbors = []; - - # // Highlight the hovered node - # highlighted.push(node_id); - - # // Get neighbors of the hovered node - # var connectedNodes = network.getConnectedNodes(node_id); - # neighbors = connectedNodes; - - # // Style update for highlighting nodes - # network.nodes.forEach(function(node) { - # if (highlighted.includes(node.id)) { - # node.color = {background: 'orange', border: 'black'}; - # node.font = {color: 'white'}; - # } else if (neighbors.includes(node.id)) { - # node.color = {background: 'yellow', border: 'black'}; - # } else { - # node.color = {background: 'gray', border: 'gray'}; - # node.font = {color: 'gray'}; - # } - # }); - - # // Style update for edges - # network.edges.forEach(function(edge) { - # if (highlighted.includes(edge.from) && highlighted.includes(edge.to)) { - # edge.color = 'orange'; - # edge.width = 4; - # } else if (neighbors.includes(edge.from) || neighbors.includes(edge.to)) { - # edge.color = 'yellow'; - # edge.width = 2; - # } else { - # edge.color = 'gray'; - # edge.width = 1; - # } - # }); - - # // Refresh the network to apply the styles - # network.redraw(); - # }); - - # // Reset on mouseout - # network.on('blurNode', function() { - # network.nodes.forEach(function(node) { - # node.color = {background: 'lightblue', border: 'black'}; - # 
node.font = {color: 'black'}; - # }); - - # network.edges.forEach(function(edge) { - # edge.color = 'black'; - # edge.width = 2; - # }); - - # network.redraw(); - # }); - # """ - - # # Define the custom JS code for click effect - # click_code = """ - # var highlighted = []; - # var neighbors = []; - - # // Event for clicking on nodes - # network.on('selectNode', function(params) { - # var node_id = params.nodes[0]; - - # // Get the node data - # highlighted = []; - # neighbors = []; - - # // Highlight the clicked node - # highlighted.push(node_id); - - # // Get neighbors of the clicked node - # var connectedNodes = network.getConnectedNodes(node_id); - # neighbors = connectedNodes; - - # // Style update for highlighting nodes - # network.nodes.forEach(function(node) { - # if (highlighted.includes(node.id)) { - # node.color = {background: 'orange', border: 'black'}; - # node.font = {color: 'white'}; - # } else if (neighbors.includes(node.id)) { - # node.color = {background: 'yellow', border: 'black'}; - # } else { - # node.color = {background: 'gray', border: 'gray'}; - # node.font = {color: 'gray'}; - # } - # }); - - # // Style update for edges - # network.edges.forEach(function(edge) { - # if (highlighted.includes(edge.from) && highlighted.includes(edge.to)) { - # edge.color = 'orange'; - # edge.width = 4; - # } else if (neighbors.includes(edge.from) || neighbors.includes(edge.to)) { - # edge.color = 'yellow'; - # edge.width = 2; - # } else { - # edge.color = 'gray'; - # edge.width = 1; - # } - # }); - - # // Refresh the network to apply the styles - # network.redraw(); - # }); - - # // Reset the styling when clicking anywhere outside a node - # network.on('deselectNode', function() { - # network.nodes.forEach(function(node) { - # node.color = {background: 'lightblue', border: 'black'}; - # node.font = {color: 'black'}; - # }); - - # network.edges.forEach(function(edge) { - # edge.color = 'black'; - # edge.width = 2; - # }); - - # network.redraw(); - # }); - # """ 
- - - # # Add the click event script - # net.set_options(click_code) - - # net.show("network_click.html") - tmp_path = os.path.join(tempfile.gettempdir(), "graph.html") - net.save_graph(tmp_path) - with open(tmp_path, "r", encoding="utf-8") as f: - html_content = f.read() - b64 = base64.b64encode(html_content.encode("utf-8")).decode("utf-8") - return f'' -# ===================================================== -# -------- GRAPH INTERACTION -------------------------- -# ===================================================== -# Function to handle node click event -def node_click(node_id): - # Return node details - node_details = { - 1: "Node 1: This is Node 1\nColor: Red\nSize: 15", - 2: "Node 2: This is Node 2\nColor: Green\nSize: 15", - 3: "Node 3: This is Node 3\nColor: Blue\nSize: 15" - } - - # Return the details of the clicked node (or a default message if node not found) - return node_details.get(node_id, "Node details not found.") - -# ===================================================== -# -------- LEGEND ------------------------------------- -# ===================================================== -# def generate_legend_html(color_mode="type", graph=None): -def generate_legend_html(graph=None): - # if color_mode == "type": - COLORS = generate_dynamic_type_colors(graph) - # else: - # COLORS = SOURCE_COLORS +def generate_legend_html(graph: Optional[nx.Graph] = None) -> str: + colors = generate_dynamic_type_colors(graph or nx.Graph()) html = "
Legend:
" - for key, color in COLORS.items(): - html += f"
" - html += f"
{key}
" + for key, color in colors.items(): + html += "
" + html += ( + f"
" + f"{key}
" + ) html += "
" return html -# ===================================================== -# -------- NODE OPERATIONS ---------------------------- -# ===================================================== -# def merge_nodes(node1, node2, color_mode): -def merge_nodes(node1, node2): - if node1 not in mygraph or node2 not in mygraph: - # return render_graph_iframe(mygraph, color_mode), generate_legend_html(color_mode, mygraph), "⚠️ Both nodes must exist." - return render_graph_iframe(mygraph), generate_legend_html(mygraph), "⚠️ Both nodes must exist." - if node1 == node2: - # return render_graph_iframe(mygraph, color_mode), generate_legend_html(color_mode, mygraph), "⚠️ Cannot merge same node." - return render_graph_iframe(mygraph), generate_legend_html(mygraph), "⚠️ Cannot merge same node." - # define new node: labels are concatenated - new_node = f"{node1}_{node2}" - mygraph.add_node(new_node, label=f"{mygraph.nodes[node1].get('label','')} + {mygraph.nodes[node2].get('label','')}", type="Merged", description=f"Merged from {node1} and {node2}", source="Merged") - # after creating the new merged node, assign the old edges to the new node and delete the old nodes - for n in [node1, node2]: - for neighbor, attrs in list(mygraph[n].items()): - if neighbor not in [node1, node2]: - mygraph.add_edge(new_node, neighbor, **attrs) - mygraph.remove_node(n) - # return render_graph_iframe(mygraph, color_mode), generate_legend_html(color_mode, mygraph), f"🔄 Merged '{node1}'+'{node2}' → '{new_node}'." - return render_graph_iframe(mygraph), generate_legend_html(mygraph), f"🔄 Merged '{node1}'+'{node2}' → '{new_node}'." 
+def _project_paths_for_folder(folder_path: str) -> Optional[ProjectPaths]: + if not folder_path: + return None + return resolve_project_paths(folder_path) -# def update_node_attributes(node_id, new_label, new_type, new_desc, new_source, color_mode): -def update_node_attributes(node_id, new_label, new_type, new_desc, new_source): - if node_id not in mygraph: - # return render_graph_iframe(mygraph, color_mode), generate_legend_html(color_mode, mygraph), f"⚠️ Node '{node_id}' not found." - return render_graph_iframe(mygraph), generate_legend_html(mygraph), f"⚠️ Node '{node_id}' not found." - if new_label: - mygraph.nodes[node_id]['label'] = new_label - if new_type: - mygraph.nodes[node_id]['type'] = new_type - if new_desc: - mygraph.nodes[node_id]['description'] = new_desc - if new_source: - mygraph.nodes[node_id]['source'] = new_source - # return render_graph_iframe(mygraph, color_mode), generate_legend_html(color_mode, mygraph), f"✏️ Node '{node_id}' updated." - return render_graph_iframe(mygraph), generate_legend_html(mygraph), f"✏️ Node '{node_id}' updated." 
-# ===================================================== -# -------- SAVE GRAPH --------------------------------- -# ===================================================== -# def save_graph_json(): -# data = { -# "nodes": [dict(id=n, **d) for n, d in G.nodes(data=True)], -# "links": [dict(source=u, target=v, **d) for u, v, d in G.edges(data=True)] -# } -# with open("graph.json", "w", encoding="utf-8") as f: -# json.dump(data, f, indent=2) -# return "✅ Graph saved as graph.json" - -def save_graph_pickle(): - with open("graph.pkl", "wb") as f: - pickle.dump(mygraph, f) - return "✅ Graph saved as graph.pkl" - - -def handle_ingestion(folder_path): - # ===================================================== - # -------- FOLDER-BASED DOCUMENT SELECTION ------------ - # ===================================================== - # Initialize an empty list to store status messages - status_messages = [] - if not folder_path or not os.path.isdir(folder_path): - yield None, "⚠️ Please enter a valid folder path." - else: - paths = [Path(os.path.join(folder_path, f)) for f in os.listdir(folder_path) - if ((os.path.isfile(os.path.join(folder_path, f))) and (Path(f).suffix in settings.VALID_EXTENSIONS))] - if not paths: - yield None, f"📂 No files found with extensions: {', '.join(settings.VALID_EXTENSIONS)}." 
- else: - # Iterate over the generator returned by process_files - for graph, status in ingest_paths(paths): - status_messages.append(status) - # Yield the current status messages - yield graph, "\n".join(status_messages) - - -async def create_pathrag_response(question: str, chat_history: List[gr.ChatMessage]) -> Tuple[str, List[gr.ChatMessage], str]: - """ - A response is obtained from Pathrag knowledge graph retrieval - - Parameters - ---------- - question : str - the currently user submitted question - chat_history : List[gr.ChatMessage] - a list of historical chatmessages, alternating between user and assistant - - Returns - ------- - List[gr.ChatMessage] - the updated list of historical chatmessages, alternating between user and assistant - """ - chat_history.append(gr.ChatMessage(role="user", content=question)) - pathrag = PathRAG( - system_prompt="" +def _load_graph_from_storage(folder_path: str) -> nx.Graph: + if not folder_path: + return nx.Graph() + project_paths = resolve_project_paths(folder_path) + if not Path(project_paths.storage.graph_db).exists(): + return nx.Graph() + adapter = PathStorageAdapter(paths=project_paths.storage) + return adapter.graph.copy() + + +def _load_graph_from_pickle(folder_path: str) -> nx.Graph: + project_paths = _project_paths_for_folder(folder_path) + if project_paths is None or not project_paths.graph_pickle_file.exists(): + return nx.Graph() + return load_graph_from_pickle(project_paths.graph_pickle_file) + + +def _save_graph_pickle(folder_path: str, graph: nx.Graph) -> Optional[Path]: + project_paths = _project_paths_for_folder(folder_path) + if project_paths is None: + return None + ensure_project_dirs(project_paths) + return save_graph_to_pickle(graph, project_paths.graph_pickle_file) + + +def render_graph_iframe(graph: nx.Graph, height_px: int = 650) -> str: + type_colors = generate_dynamic_type_colors(graph) + net = Network(height=f"{height_px}px", width="100%", directed=False, bgcolor="#111111", font_color="white") 
+ for node, data in graph.nodes(data=True): + node_type = data.get("type", "unknown") + title = f"type={node_type}\n{data.get('description', '')}" + color = type_colors.get(node_type, "#0EA5E9") + label = str(data.get("label", node)) + net.add_node(str(node), label=label, title=title, color=color) + + for source, target, data in graph.edges(data=True): + title = data.get("description", "") or data.get("keywords", "") + net.add_edge(str(source), str(target), title=title, value=float(data.get("weight", 1.0) or 1.0)) + + net.set_options( + """ + var options = { + "nodes": { "shape": "dot", "size": 14, "font": { "strokeWidth": 8 } }, + "edges": { "smooth": false, "color": { "color": "#94A3B8" } }, + "physics": { "enabled": false }, + "interaction": { "hover": true, "navigationButtons": true } + } + """ ) - result = await pathrag.aretrieve(question) - - # print(f"Pathrag result = {result}") - - # chat_history.append(gr.ChatMessage(role="assistant", content=result.answer, metadata={"title": "Processing..."})) - chat_history.append(gr.ChatMessage(role="assistant", content=result.answer)) - # print(f"chat_history = {chat_history}") - # make sure that the msg_input textbox is cleared again and return the updated chat history - lines = "" - if result.chunk_matches: - for i, chunk in enumerate(result.chunk_matches, 1): - head = f"{chunk.filename or chunk.document_id or '(unknown doc)'}" - lines += f"{i}. 
{head} (score={chunk.score:.3f})\n" - lines += f"{chunk.text}\n" - lines += 46*"-" + "\n\n" - # else: - # lines.append("(none)") - - print() - return "", chat_history, lines - - -async def create_lightrag_response(question: str, chat_history: List[gr.ChatMessage]) -> Tuple[str, List[gr.ChatMessage], str]: - """ - A response is obtained from Lightrag knowledge graph retrieval - - Parameters - ---------- - question : str - the currently user submitted question - chat_history : List[gr.ChatMessage] - a list of historical chatmessages, alternating between user and assistant - - Returns - ------- - List[gr.ChatMessage] - the updated list of historical chatmessages, alternating between user and assistant - """ - chat_history.append(gr.ChatMessage(role="user", content=question)) - lightrag = LightRAG( - system_prompt="" + + tmp_path = Path(tempfile.gettempdir()) / "appl_kgraph_graph.html" + net.save_graph(str(tmp_path)) + rendered = tmp_path.read_text(encoding="utf-8") + return ( + f'' ) - result = await lightrag.aretrieve(question) - - # print(f"Lightrag result = {result}") - - # chat_history.append(gr.ChatMessage(role="assistant", content=result.answer, metadata={"title": "Processing..."})) - chat_history.append(gr.ChatMessage(role="assistant", content=result.answer)) - # print(f"chat_history = {chat_history}") - # make sure that the msg_input textbox is cleared again and return the updated chat history - lines = "" - if result.all_chunks: - for i, chunk in enumerate(result.all_chunks, 1): - # head = f"{chunk.filename or chunk.document_id or '(unknown doc)'}" - # lines += f"{i}. {head} (score={chunk.score:.3f})\n" - lines += f"{i}. 
{chunk['source_type']}" - if chunk['source_type'] == "vector": - lines += f" (score={chunk['score']:.3f})" - lines += "\n" - lines += f"{chunk['text']}\n" - lines += 46*"-" + "\n\n" - - return "", chat_history, lines - -# async def predict(input, history): -# """ -# Predict the response of the chatbot and complete a running list of chat history. -# """ -# history.append({"role": "user", "content": input}) -# response = await create_pathrag_response(history) -# history.append({"role": "assistant", "content": response}) -# messages = [(history[i]["content"], history[i+1]["content"]) for i in range(0, len(history)-1, 2)] -# return messages, history - - - - # query = "Who are the authors of LayoutParser and do they overlap any of the other articles?" - # query = input("Enter your question: ") - # conversation_history = [] - # while query not in ("exit", "quit"): - # print("\n--- PathRAG Response ---\n") - # asyncio.run(graph.main.ask_with_pathrag(query, verbose=True)) - # print("\n---\n") - # print("\n--- LightRAG Response ---\n") - # result = asyncio.run(graph.main.ask_with_lightrag(query, verbose=True)) - # conversation_history.append((query, result.answer)) - # print("\n---\n") - # query = input("Enter your question: ") - - # try: - # file_path = os.path.join(folder_path, "knowledge_graphs", "azureopenai_gpt-4o_azureopenai_text-embedding-ada-002_NLTKTextSplitter_2000_200", "knowledge_graph.pkl") - # with open(file_path, "rb") as f: - # newG = pickle.load(f) - # if not isinstance(newG, nx.Graph): - # raise ValueError("Invalid pickle file.") - # G.clear() - # G.add_nodes_from(newG.nodes(data=True)) - # G.add_edges_from(newG.edges(data=True)) - # msg = f"📦 Loaded Pickle graph from {file_path}" - # # return render_graph_iframe(G, color_mode), generate_legend_html(color_mode, G), msg - # return render_graph_iframe(G), generate_legend_html(G), msg - # except Exception as e: - # # return render_graph_iframe(G, color_mode), generate_legend_html(color_mode, G), f"❌ Load 
failed: {e}" - # return render_graph_iframe(G), generate_legend_html(G), f"❌ Load failed: {e}" -# ===================================================== -# -------- DROPDOWN UPDATE HELPER --------------------- -# ===================================================== -# def update_dropdowns(): -# nodes = sorted(G.nodes()) -# return [gr.update(choices=nodes) for _ in range(9)] + +def _get_pathrag(folder_path: str) -> PathRAG: + rag = _PATHRAG_CACHE.get(folder_path) + if rag is None: + rag = PathRAG(project_paths=resolve_project_paths(folder_path), system_prompt="") + _PATHRAG_CACHE[folder_path] = rag + return rag + + +def _get_lightrag(folder_path: str) -> LightRAG: + rag = _LIGHTRAG_CACHE.get(folder_path) + if rag is None: + rag = LightRAG(project_paths=resolve_project_paths(folder_path), system_prompt="") + _LIGHTRAG_CACHE[folder_path] = rag + return rag + + +def _history_to_turns(chat_history: List[dict]) -> List[Tuple[str, str]]: + turns: List[Tuple[str, str]] = [] + for message in chat_history or []: + if isinstance(message, dict): + role = message.get("role", "") + content = message.get("content", "") + else: + role = getattr(message, "role", "") + content = getattr(message, "content", "") + if role and content: + turns.append((role, content)) + return turns + + +def _dropdown_choices() -> List[Tuple[str, str]]: + choices: List[Tuple[str, str]] = [] + for node, data in mygraph.nodes(data=True): + label = data.get("label", node) + choices.append((f"{label} ({node})", str(node))) + return sorted(choices, key=lambda item: item[0].lower()) + + def update_dropdowns(): - # Create dropdown entries with label → id mapping - nodes = sorted(mygraph.nodes()) - labeled_nodes = [ - (f"{data.get('label', n)} ({n})", n) - for n, data in mygraph.nodes(data=True) + choices = _dropdown_choices() + return [gr.update(choices=choices, value=None) for _ in range(3)] + + +def _ingestion_payload( + graph: nx.Graph, + status: str, + active_folder_value: str, +) -> Tuple[str, str, str, 
str, Any, Any, Any]: + updates = update_dropdowns() + return render_graph_iframe(graph), generate_legend_html(graph), status, active_folder_value, *updates + + +def handle_ingestion(folder_path: str) -> Iterator[Tuple[str, str, str, str, Any, Any, Any]]: + global mygraph + + if not folder_path or not Path(folder_path).is_dir(): + yield _ingestion_payload(mygraph, "Please provide a valid folder path.", "") + return + + documents_root = Path(folder_path).expanduser().resolve() + paths = list_document_paths(documents_root) + if not paths: + empty_graph = nx.Graph() + yield _ingestion_payload( + empty_graph, + "No supported files found in the selected folder.", + str(documents_root), + ) + return + + progress_messages = [ + f"Preparing ingestion for {documents_root}", + f"Discovered {len(paths)} supported files", ] - return [gr.update(choices=labeled_nodes) for _ in range(3)] + yield _ingestion_payload(mygraph, "\n".join(progress_messages), str(documents_root)) + + progress_queue: queue.Queue[str] = queue.Queue() + outcome: dict[str, Any] = {} + error: dict[str, Exception] = {} + + def _report_progress(message: str) -> None: + progress_queue.put(message) + + def _run_ingestion() -> None: + try: + outcome["summary"] = ingest_paths( + paths, + documents_root=documents_root, + progress_callback=_report_progress, + ) + except Exception as exc: # pragma: no cover - UI integration guard + error["exception"] = exc + finally: + progress_queue.put("__DONE__") + + worker = threading.Thread(target=_run_ingestion, daemon=True) + worker.start() + + while True: + try: + message = progress_queue.get(timeout=0.2) + except queue.Empty: + continue + + if message == "__DONE__": + break + + progress_messages.append(message) + yield _ingestion_payload(mygraph, "\n".join(progress_messages[-50:]), str(documents_root)) + + if "exception" in error: + progress_messages.append(f"Error: {error['exception']}") + yield _ingestion_payload(mygraph, "\n".join(progress_messages[-50:]), 
str(documents_root)) + return + + summary = outcome["summary"] + mygraph = _load_graph_from_storage(str(documents_root)) + _PATHRAG_CACHE.pop(str(documents_root), None) + _LIGHTRAG_CACHE.pop(str(documents_root), None) + + pickle_note = "" + project_paths = resolve_project_paths(documents_root) + if not project_paths.graph_pickle_file.exists(): + saved_path = _save_graph_pickle(str(documents_root), mygraph) + if saved_path is not None: + pickle_note = f"\nSaved baseline graph pickle to {saved_path}" + else: + pickle_note = f"\nExisting working graph pickle preserved at {project_paths.graph_pickle_file}" + + status = ( + f"Ingested project at {documents_root}\n" + f"Processed files: {summary['processed_files']}\n" + f"Skipped files: {summary['skipped_files']}\n" + f"Removed files: {summary['removed_files']}\n" + f"Chunks: {summary['chunk_count']}\n" + f"Entities: {summary['entity_count']}\n" + f"Relations: {summary['relation_count']}" + f"\nRetrieval snapshot: {project_paths.retrieval_graph_pickle_file}" + f"{pickle_note}\n\n" + f"{chr(10).join(progress_messages[-20:])}" + ) + yield _ingestion_payload(mygraph, status, str(documents_root)) -def toggle_checkboxgroup(checkbox_value): - # Show the checkbox group if the checkbox is checked, hide it otherwise - return gr.update(visible=checkbox_value) +def save_current_graph(folder_path: str) -> str: + if not folder_path: + return "Select and ingest a document folder first." + saved_path = _save_graph_pickle(folder_path, mygraph) + if saved_path is None: + return "Unable to determine a project path for the current graph." 
+ return f"Saved working graph pickle to {saved_path}" -# ===================================================== -# -------- GRADIO INTERFACE --------------------------- -# ===================================================== -chathistory = [] -screenwidth = GetSystemMetrics(0) -screenheight = GetSystemMetrics(1) - -my_theme=themes.Soft(primary_hue="blue", - secondary_hue="gray", - font=[themes.GoogleFont("Oxanium"), "Arial", "sans-serif"], - spacing_size=themes.sizes.spacing_sm, - text_size=themes.sizes.text_sm) - -with gr.Blocks(theme="glass", css=".prompt {color: green}") as demo: -# with gr.Blocks(theme=my_theme, css=".prompt {color: green}") as demo: -# with gr.Blocks(theme=my_theme) as demo: + +def load_saved_graph(folder_path: str) -> Tuple[str, str, str, Any, Any, Any]: + global mygraph + + if not folder_path: + updates = update_dropdowns() + return render_graph_iframe(mygraph), generate_legend_html(mygraph), "Select and ingest a document folder first.", *updates + + project_paths = resolve_project_paths(folder_path) + if not project_paths.graph_pickle_file.exists(): + updates = update_dropdowns() + return ( + render_graph_iframe(mygraph), + generate_legend_html(mygraph), + f"No saved graph pickle found yet at {project_paths.graph_pickle_file}", + *updates, + ) + + try: + mygraph = _load_graph_from_pickle(folder_path) + message = f"Loaded working graph pickle from {project_paths.graph_pickle_file}" + except Exception as exc: + updates = update_dropdowns() + return render_graph_iframe(mygraph), generate_legend_html(mygraph), f"Failed to load saved graph pickle: {exc}", *updates + + updates = update_dropdowns() + return render_graph_iframe(mygraph), generate_legend_html(mygraph), message, *updates + + +def merge_nodes( + node1: str, + node2: str, + active_folder: str, +) -> Tuple[str, str, str, Any, Any, Any]: + global mygraph + + if not node1 or not node2: + updates = update_dropdowns() + return render_graph_iframe(mygraph), generate_legend_html(mygraph), 
"Select two nodes to merge.", *updates + if node1 not in mygraph or node2 not in mygraph: + updates = update_dropdowns() + return render_graph_iframe(mygraph), generate_legend_html(mygraph), "Both nodes must exist in the current graph.", *updates + if node1 == node2: + updates = update_dropdowns() + return render_graph_iframe(mygraph), generate_legend_html(mygraph), "Cannot merge the same node into itself.", *updates + + new_node = f"{node1}_{node2}" + suffix = 1 + while new_node in mygraph: + suffix += 1 + new_node = f"{node1}_{node2}_{suffix}" + + label1 = mygraph.nodes[node1].get("label", node1) + label2 = mygraph.nodes[node2].get("label", node2) + mygraph.add_node( + new_node, + label=f"{label1} + {label2}", + type="Merged", + description=f"Merged from {node1} and {node2}", + source="Merged", + ) + + for original in (node1, node2): + for neighbor, attrs in list(mygraph[original].items()): + if neighbor != new_node and neighbor not in (node1, node2): + mygraph.add_edge(new_node, neighbor, **attrs) + mygraph.remove_node(original) + + autosave = "" + saved_path = _save_graph_pickle(active_folder, mygraph) + if saved_path is not None: + autosave = f" Auto-saved to {saved_path}." 
+ + updates = update_dropdowns() + return render_graph_iframe(mygraph), generate_legend_html(mygraph), f"Merged '{node1}' and '{node2}' into '{new_node}'.{autosave}", *updates + + +def update_node_attributes( + node_id: str, + new_label: str, + new_type: str, + new_desc: str, + new_source: str, + active_folder: str, +) -> Tuple[str, str, str, Any, Any, Any]: + global mygraph + + if not node_id: + updates = update_dropdowns() + return render_graph_iframe(mygraph), generate_legend_html(mygraph), "Select a node to update.", *updates + if node_id not in mygraph: + updates = update_dropdowns() + return render_graph_iframe(mygraph), generate_legend_html(mygraph), f"Node '{node_id}' was not found in the current graph.", *updates + + if new_label: + mygraph.nodes[node_id]["label"] = new_label + if new_type: + mygraph.nodes[node_id]["type"] = new_type + if new_desc: + mygraph.nodes[node_id]["description"] = new_desc + if new_source: + mygraph.nodes[node_id]["source"] = new_source + + autosave = "" + saved_path = _save_graph_pickle(active_folder, mygraph) + if saved_path is not None: + autosave = f" Auto-saved to {saved_path}." 
+ + updates = update_dropdowns() + return render_graph_iframe(mygraph), generate_legend_html(mygraph), f"Updated node '{node_id}'.{autosave}", *updates + + +async def create_pathrag_response( + question: str, + chat_history: List[dict], + active_folder: str, +) -> Tuple[str, List[dict], str]: + if not active_folder: + chat_history = list(chat_history or []) + chat_history.append({"role": "assistant", "content": "Select and ingest a document folder first."}) + return "", chat_history, "" + + history = list(chat_history or []) + history.append({"role": "user", "content": question}) + try: + rag = _get_pathrag(active_folder) + result = await rag.aretrieve(question, conversation_history=_history_to_turns(history[:-1])) + history.append({"role": "assistant", "content": result.answer}) + + sources = [] + for index, chunk in enumerate(result.chunk_matches, start=1): + head = chunk.filename or chunk.document_id or "(unknown doc)" + sources.append(f"{index}. {head} (score={chunk.score:.3f})") + sources.append(chunk.text) + sources.append("-" * 46) + + return "", history, "\n".join(sources) + except Exception as exc: + history.append({"role": "assistant", "content": f"PathRAG error: {exc}"}) + return "", history, f"PathRAG error: {exc}" + + +async def create_lightrag_response( + question: str, + chat_history: List[dict], + active_folder: str, +) -> Tuple[str, List[dict], str]: + if not active_folder: + chat_history = list(chat_history or []) + chat_history.append({"role": "assistant", "content": "Select and ingest a document folder first."}) + return "", chat_history, "" + + history = list(chat_history or []) + history.append({"role": "user", "content": question}) + try: + rag = _get_lightrag(active_folder) + result = await rag.aretrieve(question, conversation_history=_history_to_turns(history[:-1])) + history.append({"role": "assistant", "content": result.answer}) + + sources = [] + for index, chunk in enumerate(result.all_chunks, start=1): + source_type = 
chunk.get("source_type", "unknown") + line = f"{index}. {source_type}" + if source_type == "vector" and chunk.get("score") is not None: + line += f" (score={float(chunk['score']):.3f})" + sources.append(line) + sources.append(chunk.get("text", "")) + sources.append("-" * 46) + + return "", history, "\n".join(sources) + except Exception as exc: + history.append({"role": "assistant", "content": f"LightRAG error: {exc}"}) + return "", history, f"LightRAG error: {exc}" + + +with gr.Blocks() as demo: gr.Markdown("## Interactive Hybrid RAG") - #sidebar + active_folder = gr.State("") + with gr.Sidebar(): - # Folder and file(s) selection - folder_path_input = gr.Textbox( - label="Enter document folder (complete path)", - interactive=True, - ) - # button to start ingestion process - go_btn = gr.Button(value="Load Knowledge Graph", variant="primary") - - # Output status messages - status_messages = gr.Textbox( - label="Status", - interactive=False, - lines=8 - ) + folder_path_input = gr.Textbox(label="Document folder", placeholder="C:\\path\\to\\documents") + go_btn = gr.Button(value="Ingest Folder", variant="primary") + status_messages = gr.Textbox(label="Status", interactive=False, lines=16) with gr.Tabs(): with gr.Tab("Chat"): with gr.Tabs(): - with gr.Tab("PathRag"): + with gr.Tab("PathRAG"): + pathrag_chatbot = gr.Chatbot(type="messages", label="PathRAG Chat History", height=420) + pathrag_sources = gr.Textbox(label="PathRAG sources", interactive=False, lines=14) with gr.Row(): - # pathrag chatbot component - pathrag_chatbot = gr.Chatbot( - type="messages", - label="PathRag Chat History", - show_copy_button = True, - avatar_images=('./images/user.png','./images/bot.png'), - height=int(screenheight*0.4), - layout='bubble', - ) - with gr.Row(): - # pathrag chunk sources - pathrag_sources = gr.Textbox( - label="PathRag sources", - interactive=False, - lines=15, - max_lines=15, - show_copy_button=True - ) - with gr.Row(): - with gr.Column(scale=9): - # prompt textbox - 
pathrag_msg_input = gr.Textbox( - elem_id="prompt", - label="Your Question", - show_label=False, - placeholder="Type your question here and hit Enter...", - ) - with gr.Column(scale=1): - # clear conversation button - pathrag_clear_btn = gr.ClearButton(components=[pathrag_msg_input, pathrag_chatbot, pathrag_sources], - value="Clear conversation", - variant="primary", - scale=1) - with gr.Tab("LightRag"): - with gr.Row(): - # lightrag chatbot component - lightrag_chatbot = gr.Chatbot( - type="messages", - label="LightRag Chat History", - show_copy_button = True, - avatar_images=('./images/user.png','./images/bot.png'), - height=int(screenheight*0.4), - layout='bubble' + pathrag_msg_input = gr.Textbox(show_label=False, placeholder="Ask a question about the active project...") + pathrag_clear_btn = gr.ClearButton( + components=[pathrag_msg_input, pathrag_chatbot, pathrag_sources], + value="Clear conversation", ) + + with gr.Tab("LightRAG"): + lightrag_chatbot = gr.Chatbot(type="messages", label="LightRAG Chat History", height=420) + lightrag_sources = gr.Textbox(label="LightRAG sources", interactive=False, lines=14) with gr.Row(): - # lightrag chunk sources - lightrag_sources = gr.Textbox( - label="LightRag sources", - interactive=False, - lines=15, - max_lines=15, - show_copy_button=True + lightrag_msg_input = gr.Textbox(show_label=False, placeholder="Ask a question about the active project...") + lightrag_clear_btn = gr.ClearButton( + components=[lightrag_msg_input, lightrag_chatbot, lightrag_sources], + value="Clear conversation", ) - with gr.Row(): - with gr.Column(scale=9): - # prompt textbox - lightrag_msg_input = gr.Textbox( - elem_id="prompt", - label="Your Question", - show_label=False, - placeholder="Type your question here and hit Enter...", - ) - with gr.Column(scale=1): - # clear conversation button - lightrag_clear_btn = gr.ClearButton(components=[lightrag_msg_input, lightrag_chatbot, lightrag_sources], - value="Clear conversation", - variant="primary", 
- scale=1) - - # state = gr.State([]) - with gr.Tab("Knowledge Graph"): with gr.Row(): with gr.Column(scale=9): graph_html = gr.HTML(render_graph_iframe(mygraph)) with gr.Column(scale=1): - with gr.Row(): - # show legend - legend_html = gr.HTML(generate_legend_html(mygraph)) - + legend_html = gr.HTML(generate_legend_html(mygraph)) + with gr.Row(): + save_pickle_btn = gr.Button(value="Save Working Graph", variant="primary") + load_pickle_btn = gr.Button(value="Load Saved Graph") with gr.Tabs(): with gr.Tab("Edit Node"): - with gr.Row(): - edit_node_dropdown = gr.Dropdown(choices=[], label="Select Node") - edit_label = gr.Textbox(label="Label") - edit_type = gr.Textbox(label="Type") - edit_desc = gr.Textbox(label="Description") - edit_source = gr.Textbox(label="Source") - updatenode_btn = gr.Button(value="Update Node", variant="primary") + edit_node_dropdown = gr.Dropdown(choices=[], label="Select Node") + edit_label = gr.Textbox(label="Label") + edit_type = gr.Textbox(label="Type") + edit_desc = gr.Textbox(label="Description") + edit_source = gr.Textbox(label="Source") + updatenode_btn = gr.Button(value="Update Node", variant="primary") with gr.Tab("Merge Nodes"): - with gr.Row(): - m1 = gr.Dropdown(choices=[], label="Node 1") - m2 = gr.Dropdown(choices=[], label="Node 2") - mergenodes_btn = gr.Button(value="Merge Nodes", variant="primary") - with gr.Tab("Save Graph"): - save_pickle_btn = gr.Button(value="💾 Save Pickle", variant="primary") - - # --- Bindings --- - # sidebar components - # Go button click triggers ingestion process - go_btn.click(fn=handle_ingestion, - inputs=[folder_path_input], - outputs=[graph_html, status_messages] - ).then(fn=update_dropdowns, - outputs=[m1, m2, edit_node_dropdown] - ) - # then(fn=load_graph_from_pkl, inputs=[folder_path_input], outputs=[graph_html]). 
\ - # submission of prompt triggers respons process - pathrag_msg_input.submit(fn=create_pathrag_response, - inputs=[pathrag_msg_input, pathrag_chatbot], - outputs=[pathrag_msg_input, pathrag_chatbot, pathrag_sources]) - lightrag_msg_input.submit(fn=create_lightrag_response, - inputs=[lightrag_msg_input, lightrag_chatbot], - outputs=[lightrag_msg_input, lightrag_chatbot, lightrag_sources]) - - # clear prompt and chat history - pathrag_clear_btn.click(fn=lambda: [None, None, None], - inputs=[], - outputs=[pathrag_msg_input, pathrag_chatbot, pathrag_sources], - queue=False) - - lightrag_clear_btn.click(fn=lambda: [None, None, None], - inputs=[], - outputs=[lightrag_msg_input, lightrag_chatbot, lightrag_sources], - queue=False) - - # update contents of node - updatenode_btn.click(fn=update_node_attributes, - inputs=[edit_node_dropdown, edit_label, edit_type, edit_desc, edit_source], - outputs=[graph_html, legend_html, status_messages]) - # merge nodes - mergenodes_btn.click(fn=merge_nodes, - inputs=[m1, m2], - outputs=[graph_html, legend_html, status_messages]) - # save graph - save_pickle_btn.click(fn=lambda: save_graph_pickle(), - outputs=status_messages) - - demo.load(fn=update_dropdowns, - outputs=[m1, m2, edit_node_dropdown]) - -demo.launch(inbrowser=True, pwa=True) + m1 = gr.Dropdown(choices=[], label="Node 1") + m2 = gr.Dropdown(choices=[], label="Node 2") + mergenodes_btn = gr.Button(value="Merge Nodes", variant="primary") + + go_btn.click( + fn=handle_ingestion, + inputs=[folder_path_input], + outputs=[graph_html, legend_html, status_messages, active_folder, m1, m2, edit_node_dropdown], + ) + + save_pickle_btn.click( + fn=save_current_graph, + inputs=[active_folder], + outputs=[status_messages], + ) + + load_pickle_btn.click( + fn=load_saved_graph, + inputs=[active_folder], + outputs=[graph_html, legend_html, status_messages, m1, m2, edit_node_dropdown], + ) + + updatenode_btn.click( + fn=update_node_attributes, + inputs=[edit_node_dropdown, edit_label, 
edit_type, edit_desc, edit_source, active_folder], + outputs=[graph_html, legend_html, status_messages, m1, m2, edit_node_dropdown], + ) + + mergenodes_btn.click( + fn=merge_nodes, + inputs=[m1, m2, active_folder], + outputs=[graph_html, legend_html, status_messages, m1, m2, edit_node_dropdown], + ) + + pathrag_msg_input.submit( + fn=create_pathrag_response, + inputs=[pathrag_msg_input, pathrag_chatbot, active_folder], + outputs=[pathrag_msg_input, pathrag_chatbot, pathrag_sources], + ) + lightrag_msg_input.submit( + fn=create_lightrag_response, + inputs=[lightrag_msg_input, lightrag_chatbot, active_folder], + outputs=[lightrag_msg_input, lightrag_chatbot, lightrag_sources], + ) + + pathrag_clear_btn.click( + fn=lambda: [None, None, None], + inputs=[], + outputs=[pathrag_msg_input, pathrag_chatbot, pathrag_sources], + queue=False, + ) + lightrag_clear_btn.click( + fn=lambda: [None, None, None], + inputs=[], + outputs=[lightrag_msg_input, lightrag_chatbot, lightrag_sources], + queue=False, + ) + + demo.load( + fn=update_dropdowns, + outputs=[m1, m2, edit_node_dropdown], + ) + + +if __name__ == "__main__": + demo.queue() + demo.launch(inbrowser=True, pwa=True) diff --git a/graph/chunker.py b/graph/chunker.py index cfbde1e..ca10aa7 100644 --- a/graph/chunker.py +++ b/graph/chunker.py @@ -184,14 +184,14 @@ def chunk_parsed_pages( return chunks i = 0 # index of the next *new* sentence to place - prev_chunk_sentence_indices: List[int] = [] + prev_new_sentence_indices: List[int] = [] while i < len(sentences): # Determine overlap sentences (from tail of previous chunk) within overlap_chars overlap: List[Sentence] = [] - if prev_chunk_sentence_indices and overlap_chars > 0: + if prev_new_sentence_indices and overlap_chars > 0: # Walk backwards over previous chunk's sentence indices, collect until we hit the char budget - tail = [sentences[k] for k in prev_chunk_sentence_indices] + tail = [sentences[k] for k in prev_new_sentence_indices] total = 0 tmp: List[Sentence] = [] 
for s in reversed(tail): @@ -256,11 +256,12 @@ def chunk_parsed_pages( "included_new_sentence_count": len(new_sents), "include_overlap_in_limit": include_overlap_in_limit, "max_chars_target": max_chars, + "exceeds_target": len(text) > max_chars, } chunks.append(chunk) # Prepare for next iteration - prev_chunk_sentence_indices = chunk_sentence_indices + prev_new_sentence_indices = [s.idx for s in new_sents] return chunks @@ -302,4 +303,4 @@ def chunk_text( print(f"[{ch['chunk_id']}] p{ch['start_page']}–p{ch['end_page']} ({ch['char_count']} chars)" f" | overlap={ch['overlap_chars_effective']}") print(ch["text"]) - print("---") \ No newline at end of file + print("---") diff --git a/graph/db_storage.py b/graph/db_storage.py index 637f0c3..3018dbf 100644 --- a/graph/db_storage.py +++ b/graph/db_storage.py @@ -1,9 +1,12 @@ from __future__ import annotations import os +import sys import sqlite3 import json import hashlib +import math +import logging from typing import Dict, Any, List, Optional, Sequence, Tuple from contextlib import contextmanager from dataclasses import replace @@ -11,6 +14,8 @@ from llm import Embedder from settings import settings, StoragePaths as SettingsStoragePaths +LOGGER = logging.getLogger("appl_kgraph.storage") + # --------------------------- # Helpers # --------------------------- @@ -355,6 +360,13 @@ def get_chunks_by_uuids(self, chunk_uuids: List[str]) -> List[Dict[str, Any]]: cur.execute(f"SELECT * FROM chunks WHERE chunk_uuid IN ({placeholders});", chunk_uuids) rows = cur.fetchall() return [dict(zip(self.KEYS, row)) for row in rows] + + def list_chunks(self) -> List[Dict[str, Any]]: + with self.connect() as con: + cur = con.cursor() + cur.execute("SELECT * FROM chunks ORDER BY filename, chunk_id;") + rows = cur.fetchall() + return [dict(zip(self.KEYS, row)) for row in rows] def delete_chunks_by_doc_id(self, doc_id: str) -> None: with self.connect() as con: @@ -495,6 +507,18 @@ def get_nodes(self, names: List[str]) -> List[Dict[str, Any]]: 
''', tuple(names)) rows = cur.fetchall() return [dict(zip(keys, row)) for row in rows] if rows else [] + + def list_nodes(self) -> List[Dict[str, Any]]: + keys = [k for k in self.KEYS_NODE if k != "id"] + with self.connect() as con: + cur = con.cursor() + cur.execute(''' + SELECT name, type, description, source_id, filepath + FROM nodes + ORDER BY name; + ''') + rows = cur.fetchall() + return [dict(zip(keys, row)) for row in rows] if rows else [] def get_nodes_by_chunk_uuids(self, chunk_uuids: List[str]) -> List[Dict[str, Any]]: @@ -621,6 +645,18 @@ def get_edges(self, pairs: List[Tuple[str, str]]) -> List[Dict[str, Any]]: ''', tuple(flat_params)) rows = cur.fetchall() return [dict(zip(keys, row)) for row in rows] if rows else [] + + def list_edges(self) -> List[Dict[str, Any]]: + keys = [k for k in self.KEYS_EDGE if k not in {"id", "u_source_name", "u_target_name"}] + with self.connect() as con: + cur = con.cursor() + cur.execute(''' + SELECT source_name, target_name, weight, description, keywords, source_id, filepath + FROM edges + ORDER BY u_source_name, u_target_name; + ''') + rows = cur.fetchall() + return [dict(zip(keys, row)) for row in rows] if rows else [] def get_edges_by_chunk_uuids(self, chunk_uuids: List[str]) -> List[Dict[str, Any]]: if not chunk_uuids: @@ -932,6 +968,10 @@ def _edge_id(a: str, b: str) -> str: x, y = _normalize_pair(a, b) return f"{x}::{y}" + @staticmethod + def _embed_text(source_name: str, target_name: str, description: str, keywords: str) -> str: + return f"{source_name} <-> {target_name} :: {description} :: {keywords}" + def add_relations(self, relations: Sequence[Dict[str, Any]]) -> None: ids: List[str] = [] texts: List[str] = [] @@ -943,7 +983,7 @@ def add_relations(self, relations: Sequence[Dict[str, Any]]) -> None: kw = r.get("keywords", "") or "" ids.append(self._edge_id(src, tgt)) # Embed src + tgt + description + keywords - texts.append(f"{src} <-> {tgt} :: {desc} :: {kw}") + texts.append(self._embed_text(src, tgt, desc, 
kw)) metas.append({ "source_name": src, "target_name": tgt, @@ -974,7 +1014,7 @@ def upsert_relations(self, relations: Sequence[Dict[str, Any]]) -> None: kw = r.get("keywords", "") or "" ids.append(self._edge_id(src, tgt)) # Embed src + tgt + description + keywords - texts.append(f"{src} <-> {tgt} :: {desc} :: {kw}") + texts.append(self._embed_text(src, tgt, desc, kw)) metas.append({ "source_name": src, "target_name": tgt, @@ -1027,6 +1067,12 @@ def __init__(self, paths: Optional[StoragePaths] = None, embedder: Optional[Embe self.chunk_vectors = ChunkVectors(collection="chunks", chroma_dir=paths.chroma_chunks, embedder=self.embedder) self.entity_vectors = EntityVectors(collection="entities", chroma_dir=paths.chroma_entities, embedder=self.embedder) self.relation_vectors = RelationVectors(collection="relations", chroma_dir=paths.chroma_relations, embedder=self.embedder) + self._chunk_search_cache: Optional[List[Tuple[Dict[str, Any], List[float]]]] = None + self._entity_search_cache: Optional[List[Tuple[Dict[str, Any], List[float]]]] = None + self._relation_search_cache: Optional[List[Tuple[Dict[str, Any], List[float]]]] = None + self._prefer_python_vector_search = self._should_use_python_vector_search() + self._vector_mutations_disabled = self._prefer_python_vector_search + self._vector_warning_emitted = False def init(self): """Create tables/collections if they don't exist yet.""" @@ -1034,6 +1080,206 @@ def init(self): self.chunksdb.init() self.graphdb.init() + def _invalidate_chunk_cache(self) -> None: + self._chunk_search_cache = None + + def _invalidate_entity_cache(self) -> None: + self._entity_search_cache = None + + def _invalidate_relation_cache(self) -> None: + self._relation_search_cache = None + + @staticmethod + def _cosine_similarity(a: Sequence[float], b: Sequence[float]) -> float: + if not a or not b or len(a) != len(b): + return 0.0 + dot = 0.0 + norm_a = 0.0 + norm_b = 0.0 + for x, y in zip(a, b): + dot += float(x) * float(y) + norm_a += float(x) * 
float(x) + norm_b += float(y) * float(y) + if norm_a <= 0.0 or norm_b <= 0.0: + return 0.0 + return max(0.0, dot / (math.sqrt(norm_a) * math.sqrt(norm_b))) + + @staticmethod + def _env_flag(name: str) -> Optional[bool]: + raw = os.getenv(name, "").strip().lower() + if not raw: + return None + if raw in {"1", "true", "yes", "on"}: + return True + if raw in {"0", "false", "no", "off"}: + return False + return None + + def _should_use_python_vector_search(self) -> bool: + force_python = self._env_flag("APPL_KGRAPH_FORCE_PYTHON_VECTOR_SEARCH") + if force_python is not None: + return force_python + force_chroma = self._env_flag("APPL_KGRAPH_FORCE_CHROMA_QUERY") + if force_chroma is not None: + return not force_chroma + return os.name == "nt" and sys.version_info >= (3, 12) + + def _warn_vector_backend_disabled(self) -> None: + if self._vector_mutations_disabled and not self._vector_warning_emitted: + LOGGER.warning( + "Native Chroma vector mutations are disabled on this environment; " + "falling back to SQL-backed similarity search." 
+ ) + self._vector_warning_emitted = True + + @staticmethod + def _flatten_query_values(values: Any) -> List[Any]: + if values is None: + return [] + if isinstance(values, list) and values and isinstance(values[0], list): + return list(values[0]) + if isinstance(values, list): + return list(values) + return [values] + + @staticmethod + def _distance_to_similarity(value: Any) -> float: + try: + return max(0.0, 1.0 - float(value)) + except (TypeError, ValueError): + return 0.0 + + def _native_chunk_search(self, text: str, n_results: int) -> List[Dict[str, Any]]: + rows = self.chunk_vectors.query(text=text, n_results=n_results) or [] + matches: List[Dict[str, Any]] = [] + for row in rows: + ids = self._flatten_query_values(row.get("ids")) + documents = self._flatten_query_values(row.get("documents")) + metadatas = self._flatten_query_values(row.get("metadatas")) + distances = self._flatten_query_values(row.get("distances")) + size = max(len(ids), len(documents), len(metadatas), len(distances)) + for index in range(size): + metadata = metadatas[index] if index < len(metadatas) and isinstance(metadatas[index], dict) else {} + matches.append({ + "chunk_uuid": ids[index] if index < len(ids) else "", + "doc_id": metadata.get("doc_id", ""), + "filename": metadata.get("filename", ""), + "text": documents[index] if index < len(documents) else "", + "score": self._distance_to_similarity(distances[index] if index < len(distances) else None), + }) + return matches[:n_results] + + def _native_entity_search(self, text: str, n_results: int) -> List[Dict[str, Any]]: + rows = self.entity_vectors.query(text=text, n_results=n_results) or [] + matches: List[Dict[str, Any]] = [] + for row in rows: + metadatas = self._flatten_query_values(row.get("metadatas")) + distances = self._flatten_query_values(row.get("distances")) + for index, metadata in enumerate(metadatas): + if not isinstance(metadata, dict): + continue + matches.append({ + "name": metadata.get("name", ""), + "type": 
metadata.get("type", ""), + "description": metadata.get("description", ""), + "source_id": metadata.get("source_id", ""), + "filepath": metadata.get("filepath", ""), + "score": self._distance_to_similarity(distances[index] if index < len(distances) else None), + }) + return matches[:n_results] + + def _native_relation_search(self, text: str, n_results: int) -> List[Dict[str, Any]]: + rows = self.relation_vectors.query(text=text, n_results=n_results) or [] + matches: List[Dict[str, Any]] = [] + for row in rows: + metadatas = self._flatten_query_values(row.get("metadatas")) + distances = self._flatten_query_values(row.get("distances")) + for index, metadata in enumerate(metadatas): + if not isinstance(metadata, dict): + continue + matches.append({ + "source_name": metadata.get("source_name", ""), + "target_name": metadata.get("target_name", ""), + "description": metadata.get("description", ""), + "keywords": metadata.get("keywords", ""), + "weight": metadata.get("weight", 0), + "source_id": metadata.get("source_id", ""), + "filepath": metadata.get("filepath", ""), + "score": self._distance_to_similarity(distances[index] if index < len(distances) else None), + }) + return matches[:n_results] + + def _rank_records( + self, + *, + query_text: str, + cache: List[Tuple[Dict[str, Any], List[float]]], + n_results: int, + ) -> List[Dict[str, Any]]: + if not query_text or not cache or n_results <= 0: + return [] + query_embedding = self.embedder.embed_texts([query_text])[0] + scored: List[Tuple[float, Dict[str, Any]]] = [] + for record, embedding in cache: + score = self._cosine_similarity(query_embedding, embedding) + scored.append((score, record)) + scored.sort(key=lambda item: item[0], reverse=True) + return [{**record, "score": score} for score, record in scored[:n_results]] + + def search_chunks(self, text: str, n_results: int = 5) -> List[Dict[str, Any]]: + if not self._prefer_python_vector_search: + try: + return self._native_chunk_search(text=text, 
n_results=n_results) + except Exception: + pass + if self._chunk_search_cache is None: + chunks = self.chunksdb.list_chunks() + texts = [chunk.get("text", "") or "" for chunk in chunks] + embeddings = self.embedder.embed_texts(texts) if texts else [] + self._chunk_search_cache = list(zip(chunks, embeddings)) + return self._rank_records(query_text=text, cache=self._chunk_search_cache, n_results=n_results) + + def search_entities(self, text: str, n_results: int = 5) -> List[Dict[str, Any]]: + if not self._prefer_python_vector_search: + try: + return self._native_entity_search(text=text, n_results=n_results) + except Exception: + pass + if self._entity_search_cache is None: + entities = self.graphdb.list_nodes() + texts = [ + EntityVectors._embed_text( + entity.get("name", ""), + entity.get("type", "") or "", + entity.get("description", "") or "", + ) + for entity in entities + ] + embeddings = self.embedder.embed_texts(texts) if texts else [] + self._entity_search_cache = list(zip(entities, embeddings)) + return self._rank_records(query_text=text, cache=self._entity_search_cache, n_results=n_results) + + def search_relations(self, text: str, n_results: int = 5) -> List[Dict[str, Any]]: + if not self._prefer_python_vector_search: + try: + return self._native_relation_search(text=text, n_results=n_results) + except Exception: + pass + if self._relation_search_cache is None: + relations = self.graphdb.list_edges() + texts = [ + RelationVectors._embed_text( + relation.get("source_name", ""), + relation.get("target_name", ""), + relation.get("description", "") or "", + relation.get("keywords", "") or "", + ) + for relation in relations + ] + embeddings = self.embedder.embed_texts(texts) if texts else [] + self._relation_search_cache = list(zip(relations, embeddings)) + return self._rank_records(query_text=text, cache=self._relation_search_cache, n_results=n_results) + def get_llm_cache(self, model: str, prompt_sha: str, text_sha: str, max_age_hours: int) -> Optional[str]: 
return self.documentsdb.get_llm_cache(model, prompt_sha, text_sha, max_age_hours) @@ -1049,17 +1295,23 @@ def add_document(self, metadata: Dict[str, Any], full_text: str) -> None: # 2) Chunks schema def add_chunks(self, chunks: Sequence[Dict[str, Any]]) -> None: self.chunksdb.add_chunks(chunks) + self._invalidate_chunk_cache() # 3) Knowledge Graph schema def add_kg_nodes(self, nodes: List[Dict[str, Any]]) -> None: self.graphdb.add_nodes(nodes) + self._invalidate_entity_cache() def add_kg_edges(self, edges: List[Dict[str, Any]]) -> None: self.graphdb.add_edges(edges) + self._invalidate_relation_cache() # 4) Chunk vectors def add_chunk_vectors(self, chunks: Sequence[Dict[str, Any]]) -> None: if chunks: + if self._vector_mutations_disabled: + self._warn_vector_backend_disabled() + return self.chunk_vectors.add_chunks(chunks) # 5) Entity vectors @@ -1069,6 +1321,9 @@ def add_entity_vectors(self, entities: Sequence[Dict[str, Any]]) -> None: Uniqueness enforced via entity name. """ if entities: + if self._vector_mutations_disabled: + self._warn_vector_backend_disabled() + return self.entity_vectors.add_entities(entities) # 6) Relation vectors @@ -1079,6 +1334,9 @@ def add_relation_vectors(self, relations: Sequence[Dict[str, Any]]) -> None: Uniqueness enforced via normalized ID "min::max". 
""" if relations: + if self._vector_mutations_disabled: + self._warn_vector_backend_disabled() + return self.relation_vectors.add_relations(relations) # ---------- Get-only APIs ---------- @@ -1134,6 +1392,8 @@ def get_edges_by_chunk_uuids(self, chunk_uuids: List[str]) -> List[Dict[str, Any # 4) Chunk Vectors def get_chunk_vector(self, chunk_uuid: str) -> Optional[Dict[str, Any]]: + if self._vector_mutations_disabled: + return None return self.chunk_vectors.get([chunk_uuid]) # 5) Entity Vectors @@ -1144,11 +1404,15 @@ def get_entities(self, names: List[str]) -> List[Dict[str, Any]]: """ if not names: return [] + if self._vector_mutations_disabled: + return [] return self.entity_vectors.get_entities(names) # 6) Relation Vectors def get_relations(self, pairs: List[Tuple[str, str]]) -> Optional[Dict[str, Any]]: + if self._vector_mutations_disabled: + return [] return self.relation_vectors.get_relations(pairs) # ---------- Delete-only APIs ---------- @@ -1160,12 +1424,15 @@ def delete_document(self, doc_id: str) -> None: # 2) Chunks def delete_chunks_by_doc_id(self, doc_id: str) -> None: self.chunksdb.delete_chunks_by_doc_id(doc_id) + self._invalidate_chunk_cache() def delete_chunk_by_uuid(self, chunk_uuid: str) -> None: self.chunksdb.delete_chunk_by_uuid(chunk_uuid) + self._invalidate_chunk_cache() def delete_chunks_by_uuids(self, chunk_uuids: List[str]) -> None: self.chunksdb.delete_chunks_by_uuids(chunk_uuids) + self._invalidate_chunk_cache() # 3) Graph def delete_node(self, name: str) -> None: @@ -1173,21 +1440,27 @@ def delete_node(self, name: str) -> None: Pass-through: delete one graph node by name. """ self.graphdb.delete_node(name) + self._invalidate_entity_cache() def delete_nodes(self, names: List[str]) -> None: """ Pass-through: bulk delete graph nodes by name. 
""" self.graphdb.delete_nodes(names) + self._invalidate_entity_cache() def delete_edge(self, source_name: str, target_name: str) -> None: self.graphdb.delete_edge(source_name, target_name) + self._invalidate_relation_cache() def delete_edges(self, pairs: List[Tuple[str, str]]) -> None: self.graphdb.delete_edges(pairs) + self._invalidate_relation_cache() # 4) Chunk Vectors def delete_chunk_vector(self, chunk_uuid: str) -> None: + if self._vector_mutations_disabled: + return self.chunk_vectors.delete([chunk_uuid]) # 5) Entity Vectors @@ -1197,10 +1470,14 @@ def delete_entity_vector(self, names: List[str]) -> None: """ if not names: return + if self._vector_mutations_disabled: + return self.entity_vectors.delete_entities(names) # 6) Relation Vectors def delete_relation_vector(self, pairs: List[Tuple[str, str]]) -> None: + if self._vector_mutations_disabled: + return self.relation_vectors.delete_relations(pairs) # ---------- Update and Upsert APIs ---------- @@ -1212,6 +1489,7 @@ def upsert_document(self, doc_id: str, updates: Dict[str, Any]) -> None: # 2) Chunks def upsert_chunk(self, chunk_uuid: str, updates: Dict[str, Any]) -> None: self.chunksdb.update_chunk(chunk_uuid, updates) + self._invalidate_chunk_cache() # 3) Graph def upsert_node(self, name: str, updates: Dict[str, Any]) -> None: @@ -1247,6 +1525,7 @@ def upsert_nodes(self, updates_list: List[Dict[str, Any]]) -> None: self.graphdb.add_nodes(to_add) if to_update: self.graphdb.update_nodes(to_update) + self._invalidate_entity_cache() def upsert_edge(self, source_name: str, target_name: str, updates: Dict[str, Any]) -> None: self.graphdb.update_edge(source_name, target_name, updates) @@ -1264,15 +1543,31 @@ def upsert_edges(self, updates_list: List[Dict[str, Any]]) -> None: self.graphdb.add_edges(to_add) if to_update: self.graphdb.update_edges(to_update) + self._invalidate_relation_cache() # 4) Chunk Vectors def upsert_chunk_vector(self, chunks: Sequence[Dict[str, Any]]) -> None: + if 
self._vector_mutations_disabled: + self._warn_vector_backend_disabled() + self._invalidate_chunk_cache() + return self.chunk_vectors.upsert(chunks) + self._invalidate_chunk_cache() # 5) Entity Vectors def upsert_entity_vector(self, entities: List[Dict[str, Any]]) -> None: + if self._vector_mutations_disabled: + self._warn_vector_backend_disabled() + self._invalidate_entity_cache() + return self.entity_vectors.upsert_entities(entities) + self._invalidate_entity_cache() # 6) Relation Vectors def upsert_relation_vector(self, relations: List[Dict[str, Any]]) -> None: - self.relation_vectors.upsert_relations(relations) \ No newline at end of file + if self._vector_mutations_disabled: + self._warn_vector_backend_disabled() + self._invalidate_relation_cache() + return + self.relation_vectors.upsert_relations(relations) + self._invalidate_relation_cache() diff --git a/graph/extractor.py b/graph/extractor.py index 0dff61a..c974c27 100644 --- a/graph/extractor.py +++ b/graph/extractor.py @@ -1,80 +1,60 @@ from __future__ import annotations -import re +import hashlib import json +import re +from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass -from prompts import PROMPTS -from llm import Chat from typing import Any, Dict, Iterable, List, Optional, Tuple -from settings import settings -from concurrent.futures import ThreadPoolExecutor, as_completed + from db_storage import Storage -import hashlib +from llm import Chat +from prompts import PROMPTS +from settings import settings +from utils import detect_language, normalize_language_name -# ───────────────────────────────────────────────────────────── -# Helpers -# ───────────────────────────────────────────────────────────── def _sha256(s: str) -> str: - """ - Computes SHA-256 hash of a string for caching keys. + return hashlib.sha256(s.encode("utf-8")).hexdigest() - Args: - s (str): The input string to hash. - Returns: - str: The hexadecimal representation of the SHA-256 hash. 
- """ - return hashlib.sha256(s.encode("utf-8")).hexdigest() +def _default_entity_types() -> List[str]: + return list(settings.prompts.default_entity_types or PROMPTS["DEFAULT_ENTITY_TYPES"]) -# ───────────────────────────────────────────────────────────── -# Prompt Builder -# ───────────────────────────────────────────────────────────── def build_entity_relation_prompt( text: str, language: Optional[str] = None, entity_types: Optional[Iterable[str]] = None, ) -> str: - """ - Builds a prompt for entity and relationship extraction from text. - - Fills the entity extraction prompt template with appropriate delimiters, language, - examples, and entity types. Formats examples to replace placeholder literals. - - Args: - text (str): The input text chunk to extract entities and relationships from. - language (Optional[str], optional): The output language for extraction. - Defaults to the configured default language. - entity_types (Optional[Iterable[str]], optional): The types of entities to extract. - Defaults to the configured default entity types. - - Returns: - str: A formatted prompt ready for LLM consumption. 
- """ - # 1) Base context for the prompt (without examples yet) examples_template = "\n\n".join(PROMPTS.get("entity_extraction_examples", [])) ctx = dict( - tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"], - record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"], - completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"], - entity_types=", ".join(entity_types) if entity_types else ", ".join(PROMPTS["DEFAULT_ENTITY_TYPES"]), + tuple_delimiter=settings.prompts.tuple_delimiter, + record_delimiter=settings.prompts.record_delimiter, + completion_delimiter=settings.prompts.completion_delimiter, + entity_types=", ".join(entity_types or _default_entity_types()), examples="", - language=language or PROMPTS["DEFAULT_LANGUAGE"], + language=normalize_language_name(language, settings.prompts.default_language), input_text=text, ) + ctx["examples"] = examples_template.format(**ctx) + return PROMPTS["entity_extraction"].format(**ctx) - # 2) Join & format the examples with the SAME ctx so placeholders are replaced - examples = examples_template.format(**ctx) # fill in delimiters, entity_types, language - ctx["examples"] = examples - # 3) Finally format the main template (including the filled examples) - template = PROMPTS["entity_extraction"] - return template.format(**ctx) +def build_entity_audit_prompt( + text: str, + *, + initial_extraction: str, + language: Optional[str] = None, + entity_types: Optional[Iterable[str]] = None, +) -> str: + return PROMPTS["entity_extraction_audit"].format( + input_text=text, + initial_extraction=initial_extraction, + language=normalize_language_name(language, settings.prompts.default_language), + entity_types=", ".join(entity_types or _default_entity_types()), + ) -# ───────────────────────────────────────────────────────────── -# Parsing utilities (regex + delimiter tolerant) -# ───────────────────────────────────────────────────────────── @dataclass class ParsedOutput: @@ -85,35 +65,21 @@ class ParsedOutput: _FANCY_QUOTES = { - "“": '"', 
"”": '"', "„": '"', - "‘": "'", "’": "'", + "“": '"', + "”": '"', + "„": '"', + "‘": "'", + "’": "'", } -def _normalize_quotes(s: str) -> str: - """ - Normalizes fancy/smart quotes to standard ASCII quotes. - - Args: - s (str): The input string with potential fancy quotes. - Returns: - str: The string with all fancy quotes replaced by standard quotes. - """ - for k, v in _FANCY_QUOTES.items(): - s = s.replace(k, v) +def _normalize_quotes(s: str) -> str: + for key, value in _FANCY_QUOTES.items(): + s = s.replace(key, value) return s def _strip_parens(s: str) -> str: - """ - Removes surrounding parentheses from a string if present. - - Args: - s (str): The input string. - - Returns: - str: The string with outer parentheses removed, or the original string if no parentheses. - """ s = s.strip() if s.startswith("(") and s.endswith(")"): return s[1:-1].strip() @@ -121,38 +87,18 @@ def _strip_parens(s: str) -> str: def _strip_quotes(s: str) -> str: - """ - Removes surrounding quotes from a string if present. - - Args: - s (str): The input string. - - Returns: - str: The string with outer quotes removed, or the original string if no quotes. - """ - s = s.strip() - s = _normalize_quotes(s) + s = _normalize_quotes(s.strip()) if (s.startswith('"') and s.endswith('"')) or (s.startswith("'") and s.endswith("'")): return s[1:-1] return s def _to_float_or_none(x: str) -> Optional[float]: - """ - Extracts and converts a floating-point number from a string. - - Args: - x (str): The input string containing a number. - - Returns: - Optional[float]: The extracted float value, or None if no valid number is found. 
- """ - x = x.strip() - m = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", x) - if not m: + match = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", x.strip()) + if not match: return None try: - return float(m.group(0)) + return float(match.group(0)) except Exception: return None @@ -163,108 +109,65 @@ def parse_model_output( record_delim: Optional[str] = None, completion_delim: Optional[str] = None, ) -> ParsedOutput: - """ - Parse the LLM output into entities and relationships using regex/splits. - - Expected generated record shapes (from your prompt): - ("entity"<|>"name"<|>"type"<|>"description") - ("relationship"<|>"source"<|>"target"<|>"description"<|>"keywords"<|>"strength") - ("content_keywords"<|>"kw1, kw2, ...") - - Tolerates both real delimiters and accidental literal - '{record_delimiter}' / '{tuple_delimiter}' tokens from the model. - """ - tuple_delim = tuple_delim or PROMPTS["DEFAULT_TUPLE_DELIMITER"] - record_delim = record_delim or PROMPTS["DEFAULT_RECORD_DELIMITER"] - completion_delim = completion_delim or PROMPTS["DEFAULT_COMPLETION_DELIMITER"] + tuple_delim = tuple_delim or settings.prompts.tuple_delimiter + record_delim = record_delim or settings.prompts.record_delimiter + completion_delim = completion_delim or settings.prompts.completion_delimiter raw = _normalize_quotes(raw) + raw = raw.replace("{tuple_delimiter}", tuple_delim) + raw = raw.replace("{record_delimiter}", record_delim) + raw = raw.replace("{completion_delimiter}", completion_delim) - # Defensive: handles examples that slipped through or model echoes - raw = raw.replace("{tuple_delimiter}", tuple_delim) \ - .replace("{record_delimiter}", record_delim) \ - .replace("{completion_delimiter}", completion_delim) - - - # Truncate at completion delimiter if present if completion_delim in raw: raw = raw.split(completion_delim, 1)[0] - - # Some models may echo headings, keep only after "Output:" if present if "Output:" in raw: raw = raw.split("Output:", 1)[1] - # Split into records by record 
delimiter (tolerate trailing spaces/newlines) - recs = re.split(rf"{re.escape(record_delim)}\s*", raw) - recs = [r.strip() for r in recs if r.strip()] + records = re.split(rf"{re.escape(record_delim)}\s*", raw) + records = [record.strip() for record in records if record.strip()] + entities: List[Dict[str, Any]] = [] relationships: List[Dict[str, Any]] = [] content_keywords: List[str] = [] - for rec in recs: - body = _strip_parens(rec) - # Split fields by tuple delimiter - parts = [p.strip() for p in body.split(tuple_delim)] - parts = [_strip_quotes(p) for p in parts if p.strip() != ""] - + for record in records: + body = _strip_parens(record) + parts = [_strip_quotes(part.strip()) for part in body.split(tuple_delim) if part.strip()] if not parts: continue - tag = parts[0].lower() + tag = parts[0].lower() if tag == "entity" and len(parts) >= 4: - # ("entity", name, type, description) - name, etype, desc = parts[1], parts[2], parts[3] - entities.append({ - "name": name, - "type": etype, - "description": desc, - }) - + entities.append( + { + "name": parts[1], + "type": parts[2], + "description": parts[3], + } + ) elif tag == "relationship" and len(parts) >= 6: - # Supported formats: - # 6-tuple: ("relationship", source_name, target_name, description, keywords, strength) _, src, tgt, desc, keywords, strength = parts[:6] - relationships.append({ - "source_name": src, - "target_name": tgt, - "description": desc, - "keywords": keywords, - "weight": _to_float_or_none(strength), - }) - + relationships.append( + { + "source_name": src, + "target_name": tgt, + "description": desc, + "keywords": keywords, + "weight": _to_float_or_none(strength), + } + ) elif tag == "content_keywords" and len(parts) >= 2: - # ("content_keywords", "kw1, kw2, ...") - kws = [k.strip() for k in parts[1].split(",") if k.strip()] - content_keywords.extend(kws) - - else: - # Unknown tag—keep raw for debugging (raw_records contains it) - pass + content_keywords.extend([part.strip() for part in 
parts[1].split(",") if part.strip()]) return ParsedOutput( entities=entities, relationships=relationships, content_keywords=content_keywords, - raw_records=recs, + raw_records=records, ) -# ───────────────────────────────────────────────────────────── -# Public extraction API (chunk-by-chunk + batch) -# ───────────────────────────────────────────────────────────── def _get_chunk_text(chunk: Dict[str, Any]) -> str: - """ - Extracts text content from a chunk dictionary. - - Args: - chunk (Dict[str, Any]): A chunk dictionary that may contain text under various keys. - - Returns: - str: The text content of the chunk. - - Raises: - KeyError: If the chunk doesn't contain text under expected keys. - """ for key in ("text", "content", "body"): if key in chunk and isinstance(chunk[key], str): return chunk[key] @@ -272,220 +175,300 @@ def _get_chunk_text(chunk: Dict[str, Any]) -> str: def _require_chunk_uuid(chunk: Dict[str, Any]) -> str: - """ - Validates and extracts the chunk_uuid from a chunk dictionary. - - Args: - chunk (Dict[str, Any]): A chunk dictionary. - - Returns: - str: The chunk_uuid as a string. - - Raises: - KeyError: If chunk_uuid is missing or empty. - """ if "chunk_uuid" not in chunk or not chunk["chunk_uuid"]: raise KeyError("Each chunk MUST include 'chunk_uuid' (used as source_id).") return str(chunk["chunk_uuid"]) -def extract_from_chunks( - chunks: Iterable[Dict[str, Any]], - language: Optional[str] = None, - entity_types: Optional[Iterable[str]] = None, - client: Optional[Chat] = None, -) -> Dict[str, Any]: - """ - High-level convenience: iterate chunks, call LLM, parse, and return collected results. - Returns dict with 'entities', 'relationships', 'content_keywords'. - - PERFORMANCE ENHANCEMENTS: - - Reuses a single Chat client (singleton) to keep HTTP sessions hot. - - Adds a content-addressed LLM cache in SQLite (model + prompt_sha + text_sha). 
- *Cache stores RAW model text; on hits we parse it exactly like fresh outputs.* - - Calls the LLM ONLY for cache misses; hits are stitched back in order. - - Runs LLM calls for misses concurrently (bounded by settings-based concurrency). - - SETTINGS (from settings.py): - - Concurrency: settings.perf.max_concurrency (fallback 6 if missing) - - Cache on/off: settings.perf.cache_enabled (fallback True if missing) - - Cache TTL: settings.perf.cache_max_age_hours (fallback 720 if missing) - - Provenance: - - Even on cache hits, each extracted record is stamped with this file/chunk's - source identifiers so cross-file queries remain accurate. - - NOTES: - - Prompts are built with `build_entity_relation_prompt(...)`. - - Parsing uses `parse_model_output(...)`. - """ - # Pull settings - MAX_WORKERS = settings.llmperf.max_concurrency - CACHE_ENABLED = settings.llmperf.cache_enabled - CACHE_MAX_AGE_HOURS = settings.llmperf.cache_max_age_hours - - # 1) Shared LLM client + create Storage facade that connects to SQLite & vector DB - chat = client or Chat.singleton() - storage: Optional[Storage] - try: - storage = Storage() - except: - raise - else: - storage.init() - - # 2) Materialize chunks to keep stable indexing for stitching results - chunk_list: List[Dict[str, Any]] = list(chunks) - - # 3) Build the exact prompts for each chunk - # and compute cache keys (prompt_sha, text_sha) per chunk. 
- prompts: List[str] = [] - keys: List[Tuple[str, str]] = [] # (prompt_sha, text_sha) - for ch in chunk_list: - txt = _get_chunk_text(ch) - prompt = build_entity_relation_prompt( - text=txt, - language=language, - entity_types=entity_types, - ) - prompts.append(prompt) - keys.append((_sha256(prompt), _sha256(txt))) - # 4) Probe cache; mark misses +def _resolve_chunk_language( + chunk: Dict[str, Any], + *, + explicit_language: Optional[str] = None, +) -> str: + if explicit_language: + return normalize_language_name(explicit_language, settings.prompts.default_language) + + default_language = settings.prompts.default_language + if settings.extraction.use_chunk_language: + for key in ("chunk_language", "language", "document_language"): + if chunk.get(key): + return normalize_language_name(str(chunk.get(key)), default_language) + if settings.extraction.detect_chunk_language: + detected = detect_language(_get_chunk_text(chunk)) + if detected and detected != "unknown": + return normalize_language_name(detected, default_language) + elif chunk.get("document_language"): + return normalize_language_name(str(chunk.get("document_language")), default_language) + + return normalize_language_name(None, default_language) + + +def _ensure_storage(storage: Optional[Storage]) -> Storage: + active_storage = storage or Storage() + active_storage.init() + return active_storage + + +def _run_cached_prompts( + *, + chat: Chat, + storage: Storage, + prompts: List[str], + text_hash_inputs: List[str], + system_prompt: str, +) -> List[str]: + max_workers = settings.llmperf.max_concurrency + cache_enabled = settings.llmperf.cache_enabled + cache_max_age_hours = settings.llmperf.cache_max_age_hours model_name = chat.model - raw_outputs: List[Optional[str]] = [None] * len(chunk_list) + + keys = [(_sha256(prompt), _sha256(text_hash)) for prompt, text_hash in zip(prompts, text_hash_inputs)] + raw_outputs: List[Optional[str]] = [None] * len(prompts) to_run: List[int] = [] - if CACHE_ENABLED: - for 
i, (psha, tsha) in enumerate(keys): - cached = storage.get_llm_cache(model_name, psha, tsha, CACHE_MAX_AGE_HOURS) - if cached is not None: - raw_outputs[i] = cached + if cache_enabled: + for index, (prompt_sha, text_sha) in enumerate(keys): + cached = storage.get_llm_cache(model_name, prompt_sha, text_sha, cache_max_age_hours) + if cached is None: + to_run.append(index) else: - to_run.append(i) + raw_outputs[index] = cached else: - to_run = list(range(len(chunk_list))) + to_run = list(range(len(prompts))) - # 5) Call the LLM ONLY for cache misses, in parallel (bounded) - def _call_one(i: int) -> str: - return chat.generate(prompt=prompts[i], system="You extract entities and relationships precisely in the required format. Do not add commentary.") + def _call_one(index: int) -> str: + return chat.generate(prompt=prompts[index], system=system_prompt) if to_run: - with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex: - futs = {ex.submit(_call_one, i): i for i in to_run} - for fut in as_completed(futs): - i = futs[fut] - out = fut.result() - raw_outputs[i] = out - if CACHE_ENABLED and storage is not None: - psha, tsha = keys[i] - storage.put_llm_cache(model_name, psha, tsha, out) - - # 6) Parse outputs and attach per-chunk provenance (identical to your approach) - all_entities: List[Dict[str, Any]] = [] - all_relationships: List[Dict[str, Any]] = [] - all_keywords: List[str] = [] + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(_call_one, index): index for index in to_run} + for future in as_completed(futures): + index = futures[future] + output = future.result() + raw_outputs[index] = output + if cache_enabled: + prompt_sha, text_sha = keys[index] + storage.put_llm_cache(model_name, prompt_sha, text_sha, output) + + return [output or "" for output in raw_outputs] + + +def _parse_audit_output(raw: str) -> Dict[str, Any]: + text = raw.strip() + if not text: + return {"missing_entities": [], "missing_relationships": [], 
"summary": ""} - for i, ch in enumerate(chunk_list): - raw = raw_outputs[i] or "" - parsed = parse_model_output(raw) # existing parser + try: + parsed = json.loads(text) + except json.JSONDecodeError: + match = re.search(r"\{.*\}", text, re.DOTALL) + if not match: + return { + "missing_entities": [], + "missing_relationships": [], + "summary": "", + "raw_response": text, + } + try: + parsed = json.loads(match.group(0)) + except json.JSONDecodeError: + return { + "missing_entities": [], + "missing_relationships": [], + "summary": "", + "raw_response": text, + } + + if not isinstance(parsed, dict): + return {"missing_entities": [], "missing_relationships": [], "summary": "", "raw_response": text} + + return { + "missing_entities": parsed.get("missing_entities", []) or [], + "missing_relationships": parsed.get("missing_relationships", []) or [], + "summary": parsed.get("summary", "") or "", + } - source_id = _require_chunk_uuid(ch) - filepath = ch.get("filepath") or ch.get("filename") - for e in parsed.entities: - e["source_id"] = source_id - e["filepath"] = filepath - for r in parsed.relationships: - r["source_id"] = source_id - r["filepath"] = filepath +def extract_from_chunks( + chunks: Iterable[Dict[str, Any]], + language: Optional[str] = None, + entity_types: Optional[Iterable[str]] = None, + client: Optional[Chat] = None, + storage: Optional[Storage] = None, + audit_enabled: Optional[bool] = None, +) -> Dict[str, Any]: + chat = client or Chat.singleton() + active_storage = _ensure_storage(storage) + chunk_list = list(chunks) + resolved_entity_types = list(entity_types or _default_entity_types()) + do_audit = settings.extraction.audit_second_pass_enabled if audit_enabled is None else audit_enabled + + chunk_languages = [ + _resolve_chunk_language(chunk, explicit_language=language) + for chunk in chunk_list + ] + chunk_texts = [_get_chunk_text(chunk) for chunk in chunk_list] + extraction_prompts = [ + build_entity_relation_prompt( + text=text, + 
language=chunk_language, + entity_types=resolved_entity_types, + ) + for text, chunk_language in zip(chunk_texts, chunk_languages) + ] + + extraction_outputs = _run_cached_prompts( + chat=chat, + storage=active_storage, + prompts=extraction_prompts, + text_hash_inputs=chunk_texts, + system_prompt="You extract entities and relationships precisely in the required format. Do not add commentary.", + ) - all_entities.extend(parsed.entities) - all_relationships.extend(parsed.relationships) + all_entities: List[Dict[str, Any]] = [] + all_relationships: List[Dict[str, Any]] = [] + all_keywords: List[str] = [] + chunk_results: List[Dict[str, Any]] = [] + + for chunk, chunk_language, raw_output in zip(chunk_list, chunk_languages, extraction_outputs): + parsed = parse_model_output(raw_output) + source_id = _require_chunk_uuid(chunk) + filepath = chunk.get("filepath") or chunk.get("filename") + + entities = [] + for entity in parsed.entities: + stamped = dict(entity) + stamped["source_id"] = source_id + stamped["filepath"] = filepath + entities.append(stamped) + + relationships = [] + for relationship in parsed.relationships: + stamped = dict(relationship) + stamped["source_id"] = source_id + stamped["filepath"] = filepath + relationships.append(stamped) + + all_entities.extend(entities) + all_relationships.extend(relationships) all_keywords.extend(parsed.content_keywords) + chunk_results.append( + { + "chunk_uuid": source_id, + "filepath": filepath, + "language": chunk_language, + "entities": entities, + "relationships": relationships, + "content_keywords": parsed.content_keywords, + "raw_output": raw_output, + } + ) + + audits: List[Dict[str, Any]] = [] + if do_audit and chunk_results: + audit_prompts = [] + audit_hash_inputs = [] + for chunk, chunk_result, chunk_language in zip(chunk_list, chunk_results, chunk_languages): + extraction_snapshot = json.dumps( + { + "entities": chunk_result["entities"], + "relationships": chunk_result["relationships"], + "content_keywords": 
chunk_result["content_keywords"], + }, + ensure_ascii=False, + ) + audit_prompts.append( + build_entity_audit_prompt( + _get_chunk_text(chunk), + initial_extraction=extraction_snapshot, + language=chunk_language, + entity_types=resolved_entity_types, + ) + ) + audit_hash_inputs.append(f"{_get_chunk_text(chunk)}\n{extraction_snapshot}") + + audit_outputs = _run_cached_prompts( + chat=chat, + storage=active_storage, + prompts=audit_prompts, + text_hash_inputs=audit_hash_inputs, + system_prompt="You audit extraction completeness. Return JSON only.", + ) + + for chunk_result, raw_audit in zip(chunk_results, audit_outputs): + parsed_audit = _parse_audit_output(raw_audit) + audits.append( + { + "chunk_uuid": chunk_result["chunk_uuid"], + "filepath": chunk_result["filepath"], + "language": chunk_result["language"], + **parsed_audit, + "raw_output": raw_audit, + } + ) return { "entities": all_entities, "relationships": all_relationships, "content_keywords": sorted(set(all_keywords)), + "chunk_results": chunk_results, + "audits": audits, } -# ───────────────────────────────────────────────────────────── -# CLI (optional) — quick test driver -# ───────────────────────────────────────────────────────────── -# !! Currently not used, but could be useful for single-chunk extraction def extract_entities_relations_for_chunk( chunk: Dict[str, Any], client: Chat, language: Optional[str] = None, entity_types: Optional[Iterable[str]] = None, ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[str]]: - """ - Run the entity/relationship prompt for a single chunk and parse the result. - - Sets entity['source_id'] = chunk['chunk_uuid'] - - Sets relation['source_id'] = chunk['chunk_uuid'] - - Also carries 'filepath' (or 'filename') if present - """ - text = _get_chunk_text(chunk) - prompt = build_entity_relation_prompt(text=text, language=language, entity_types=entity_types) - system = "You extract entities and relationships precisely in the required format. Do not add commentary." 
- - raw = client.generate(prompt=prompt, system=system) - parsed = parse_model_output(raw) - - # Attach source_id (strictly chunk_uuid) and filepath if provided on the chunk - source_id = _require_chunk_uuid(chunk) - filepath = chunk.get("filepath") or chunk.get("filename") - - for e in parsed.entities: - e["source_id"] = source_id - e["filepath"] = filepath - for r in parsed.relationships: - r["source_id"] = source_id - r["filepath"] = filepath - - return parsed.entities, parsed.relationships, parsed.content_keywords + result = extract_from_chunks( + [chunk], + language=language, + entity_types=entity_types, + client=client, + audit_enabled=False, + ) + return result["entities"], result["relationships"], result["content_keywords"] + def _load_chunks_from_path(path: str) -> List[Dict[str, Any]]: - """ - Load chunks from a JSON or JSONL file. - Each record should be a dict with at least 'text' or 'content' and 'chunk_uuid'. - """ - with open(path, "r", encoding="utf-8") as f: - data = f.read() + with open(path, "r", encoding="utf-8") as file: + data = file.read() try: - # Try JSON array obj = json.loads(data) if isinstance(obj, list): - return obj # type: ignore[return-value] + return obj raise ValueError("Expected a list of chunks in JSON.") except json.JSONDecodeError: - # Try JSONL chunks: List[Dict[str, Any]] = [] for line in data.splitlines(): line = line.strip() - if not line: - continue - chunks.append(json.loads(line)) + if line: + chunks.append(json.loads(line)) return chunks def _default_demo_chunks() -> List[Dict[str, Any]]: - return [{ - "chunk_uuid": "demo-1", - "text": "Apple launched the Vision Pro with help from Foxconn. Tim Cook presented it in Cupertino during WWDC.", - "filename": "/demo/path/a.txt" - }] + return [ + { + "chunk_uuid": "demo-1", + "text": "Apple launched the Vision Pro with help from Foxconn. 
Tim Cook presented it in Cupertino during WWDC.", + "filename": "/demo/path/a.txt", + } + ] def _print_summary(res: Dict[str, Any]) -> None: print("\nEntities:") - for e in res["entities"]: - print(f" - {e['name']} [{e['type']}] src={e.get('source_id')}") + for entity in res["entities"]: + print(f" - {entity['name']} [{entity['type']}] src={entity.get('source_id')}") print("\nRelationships:") - for r in res["relationships"]: - w = r.get("weight") - print(f" - {r['source_name']} <-> {r['target_name']} w={w} src={r.get('source_id')}") + for relationship in res["relationships"]: + print( + f" - {relationship['source_name']} <-> {relationship['target_name']} " + f"w={relationship.get('weight')} src={relationship.get('source_id')}" + ) if res["content_keywords"]: print("\nKeywords:", ", ".join(res["content_keywords"])) @@ -493,25 +476,25 @@ def _print_summary(res: Dict[str, Any]) -> None: if __name__ == "__main__": import argparse - ap = argparse.ArgumentParser( - description="Extract entities and relations from chunks using Azure OpenAI and regex parsing." + parser = argparse.ArgumentParser( + description="Extract entities and relations from chunks using the configured LLM and regex parsing." 
) - ap.add_argument("--chunks", type=str, default="", help="Path to JSON/JSONL with chunks (each has text/content and chunk_uuid).") - ap.add_argument("--language", type=str, default="", help="Override output language (default from prompts).") - ap.add_argument("--entity-types", type=str, default="", help="Comma-separated entity types to enforce.") - args = ap.parse_args() + parser.add_argument("--chunks", type=str, default="", help="Path to JSON/JSONL with chunks.") + parser.add_argument("--language", type=str, default="", help="Override output language.") + parser.add_argument("--entity-types", type=str, default="", help="Comma-separated entity types.") + args = parser.parse_args() - language = args.language or None + explicit_language = args.language or None entity_types = [s.strip() for s in args.entity_types.split(",")] if args.entity_types else None - chunks = _default_demo_chunks() if not args.chunks else _load_chunks_from_path(args.chunks) - client = Chat.singleton() - result = extract_from_chunks(chunks, language=language, entity_types=entity_types, client=client) - - # Pretty print + result = extract_from_chunks( + chunks, + language=explicit_language, + entity_types=entity_types, + client=Chat.singleton(), + audit_enabled=settings.extraction.audit_second_pass_enabled, + ) _print_summary(result) - - # Also dump JSON print("\n\nJSON Output:") - print(json.dumps(result, ensure_ascii=False, indent=2)) \ No newline at end of file + print(json.dumps(result, ensure_ascii=False, indent=2)) diff --git a/graph/fileparser.py b/graph/fileparser.py index ccc8757..198f23f 100644 --- a/graph/fileparser.py +++ b/graph/fileparser.py @@ -9,6 +9,7 @@ from typing import List, Tuple, Dict, Any, Union from langchain_community.document_loaders import BSHTMLLoader from langchain_community.document_loaders import TextLoader +from settings import VALID_EXTENSIONS import utils as ut import settings @@ -18,8 +19,8 @@ class FileParser: and returns pages and metadata in a 
standardized format. """ - def __init__(self): - pass + # Can we throw this one away? Yes? + SUPPORTED_EXTENSIONS = {ext.lower() for ext in VALID_EXTENSIONS} def parse_file(self, filepath: Union[str, Path]) -> Tuple[List[Tuple[int, str]], Dict[str, Any]]: """ @@ -53,10 +54,10 @@ def parse_file(self, filepath: Union[str, Path]) -> Tuple[List[Tuple[int, str]], metadata = { 'doc_id': str(uuid.uuid4()), 'filename': filepath.name, - 'filepath': str(filepath.absolute()), + 'filepath': str(filepath.resolve()), 'file_size': stat.st_size, 'last_modified': stat.st_mtime, - 'created': stat.st_birthtime, + 'created': getattr(stat, "st_birthtime", stat.st_ctime), 'extension': extension, 'mime_type': mimetypes.guess_type(str(filepath))[0] } @@ -83,9 +84,18 @@ def parse_file(self, filepath: Union[str, Path]) -> Tuple[List[Tuple[int, str]], def _parse_text_file(self, filepath: Path) -> Tuple[List[Tuple[int, str]], Dict[str, Any]]: """Parse text files (.txt)""" - loader = TextLoader(file_path=filepath, autodetect_encoding=True) - text = loader.load() - raw_text = text[0].page_content + encodings_to_try = ["utf-8", "utf-8-sig", "cp1252", "latin-1"] + raw_text = None + for encoding in encodings_to_try: + try: + raw_text = filepath.read_text(encoding=encoding) + break + except UnicodeDecodeError: + continue + if raw_text is None: + loader = TextLoader(file_path=filepath, autodetect_encoding=False, encoding="utf-8") + text = loader.load() + raw_text = text[0].page_content # txt files do not have multiple pages pages = [(0, raw_text)] # extract metadata @@ -220,4 +230,3 @@ def convert_docx_to_pdf(self, docx_path: str) -> str: convert(input_path=docx_path, output_path=pdf_path, keep_active=False) return pdf_path - diff --git a/graph/graph_pickle.py b/graph/graph_pickle.py new file mode 100644 index 0000000..ef9e62a --- /dev/null +++ b/graph/graph_pickle.py @@ -0,0 +1,121 @@ +from __future__ import annotations + +import logging +import pickle +from pathlib import Path +from typing import 
Optional + +import networkx as nx + +from db_storage import Storage + + +def load_graph_from_pickle(path: Path) -> nx.Graph: + with path.open("rb") as handle: + graph = pickle.load(handle) + if not isinstance(graph, nx.Graph): + raise ValueError(f"Pickle at {path} does not contain a NetworkX graph.") + return graph + + +def save_graph_to_pickle(graph: nx.Graph, path: Path) -> Path: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("wb") as handle: + pickle.dump(graph, handle) + return path + + +def build_graph_from_storage( + storage: Storage, + *, + logger: Optional[logging.Logger] = None, +) -> nx.Graph: + graph = nx.Graph() + + with storage.graphdb.connect() as con: + node_rows = con.execute( + "SELECT name, type, description, source_id, filepath FROM nodes;" + ).fetchall() + edge_rows = con.execute( + "SELECT source_name, target_name, weight, description, keywords, " + "source_id, filepath FROM edges;" + ).fetchall() + + for name, type_, description, source_id, filepath in node_rows: + node_id = (name or "").strip() + if not node_id: + continue + + chunk_uuids = [] + if source_id: + chunk_uuids = [value.strip() for value in source_id.split("||") if value.strip()] + + graph.add_node( + node_id, + type=(type_ or "unknown").strip() or "unknown", + description=(description or "").strip(), + source_id=(source_id or "").strip(), + filepath=(filepath or "").strip(), + chunk_uuids=chunk_uuids, + ) + + for source, target, weight, description, keywords, source_id, filepath in edge_rows: + src_id = (source or "").strip() + tgt_id = (target or "").strip() + if not src_id or not tgt_id: + continue + if src_id not in graph or tgt_id not in graph: + if logger is not None: + logger.debug("Skipping edge with missing endpoints: %s -> %s", src_id, tgt_id) + continue + + chunk_uuids = [] + if source_id: + chunk_uuids = [value.strip() for value in source_id.split("||") if value.strip()] + + graph.add_edge( + src_id, + tgt_id, + weight=float(weight) if weight is not 
None else 1.0, + description=(description or "").strip(), + keywords=(keywords or "").strip(), + source_id=(source_id or "").strip(), + filepath=(filepath or "").strip(), + chunk_uuids=chunk_uuids, + ) + + if logger is not None: + logger.debug( + "Loaded graph snapshot with %d nodes and %d edges", + graph.number_of_nodes(), + graph.number_of_edges(), + ) + return graph + + +def load_or_build_graph_snapshot( + storage: Storage, + *, + snapshot_path: Optional[Path] = None, + logger: Optional[logging.Logger] = None, +) -> nx.Graph: + if snapshot_path is not None and snapshot_path.exists(): + try: + graph = load_graph_from_pickle(snapshot_path) + if logger is not None: + logger.debug("Loaded graph snapshot from pickle: %s", snapshot_path) + return graph + except Exception as exc: + if logger is not None: + logger.warning( + "Failed to load graph snapshot pickle at %s; rebuilding from SQLite (%s)", + snapshot_path, + exc, + ) + + graph = build_graph_from_storage(storage, logger=logger) + if snapshot_path is not None: + save_graph_to_pickle(graph, snapshot_path) + if logger is not None: + logger.debug("Saved graph snapshot pickle to %s", snapshot_path) + return graph diff --git a/graph/ingestion.py b/graph/ingestion.py index 834bde4..cb7e0ae 100644 --- a/graph/ingestion.py +++ b/graph/ingestion.py @@ -1,8 +1,9 @@ from __future__ import annotations +import json import mimetypes import uuid from pathlib import Path -from typing import Dict, Any, List, Sequence, Tuple, Optional +from typing import Callable, Dict, Any, List, Sequence, Tuple, Optional, Union from collections import Counter, defaultdict import os # local imports @@ -10,10 +11,26 @@ from fileparser import FileParser from chunker import chunk_parsed_pages from extractor import extract_from_chunks +from graph_pickle import save_graph_to_pickle, build_graph_from_storage from llm import llm_summarize_text -import settings +from settings import VALID_EXTENSIONS, settings +from logging_utils import 
configure_file_logger +from project_paths import ( + ProjectPaths, + ensure_project_dirs, + list_document_paths, + resolve_project_paths, +) import hashlib + +LOGGER = configure_file_logger( + "appl_kgraph.ingestion", + log_file=Path("ingestion.log"), + level=settings.logging.ingestion_level, + enabled=settings.logging.ingestion_enabled, +) + #-------------------------------------------------- # Helpers #-------------------------------------------------- @@ -104,6 +121,9 @@ def build_chunks( pages: Sequence[Tuple[int, str]], doc_id: str, filename: str, + *, + filepath: Optional[str] = None, + document_language: Optional[str] = None, ) -> List[Dict[str, Any]]: """ Normalize chunks for storage: @@ -126,6 +146,8 @@ def build_chunks( "doc_id": doc_id, "chunk_id": int(c.get("chunk_id", i)), "filename": filename, + "filepath": filepath, + "document_language": document_language, "text": text, "char_count": int(c.get("char_count", len(text))), "start_page": start, @@ -133,6 +155,56 @@ def build_chunks( }) return norm + +def _resolve_runtime_project_paths( + *, + paths: Sequence[Path], + documents_root: Optional[Union[Path, str]] = None, + project_paths: Optional[ProjectPaths] = None, +) -> Optional[ProjectPaths]: + if project_paths is not None: + return project_paths + if documents_root is not None: + return resolve_project_paths(documents_root) + return None + + +def _configure_ingestion_logger(project_paths: Optional[ProjectPaths]) -> None: + log_file = ( + project_paths.ingestion_log_file + if project_paths is not None + else Path("ingestion.log") + ) + configure_file_logger( + "appl_kgraph.ingestion", + log_file=log_file, + level=settings.logging.ingestion_level, + enabled=settings.logging.ingestion_enabled, + ) + + +def _write_extraction_audits( + project_paths: Optional[ProjectPaths], + filename: str, + audits: List[Dict[str, Any]], +) -> None: + if project_paths is None or not audits: + return + ensure_project_dirs(project_paths) + target = 
project_paths.extraction_audits_dir / f"{Path(filename).stem}.audit.json" + target.write_text(json.dumps(audits, ensure_ascii=False, indent=2), encoding="utf-8") + + +def _write_retrieval_graph_snapshot( + storage: Storage, + project_paths: Optional[ProjectPaths], +) -> Optional[Path]: + if project_paths is None: + return None + ensure_project_dirs(project_paths) + graph = build_graph_from_storage(storage, logger=LOGGER) + return save_graph_to_pickle(graph, project_paths.retrieval_graph_pickle_file) + def _resolve_type(votes: Counter, existing_type: str = "") -> str: existing = (existing_type or "").strip() if not votes: @@ -467,22 +539,22 @@ def remove_document_from_storage(storage: Storage, filename: str) -> None: # Get the document to retrieve its doc_id doc = storage.get_document_by_filename(filename) if not doc or not doc.get("doc_id"): - print(f"Document {filename} not found in storage.") + LOGGER.warning("Document %s not found in storage", filename) return doc_id = doc["doc_id"] - print(f"Removing document: {filename} (doc_id: {doc_id})") + LOGGER.info("Removing document %s (doc_id=%s)", filename, doc_id) # Step 1: Get all chunks associated with this document chunks = storage.get_chunks_by_doc_id(doc_id) if not chunks: - print(f"No chunks found for document {filename}") + LOGGER.info("No chunks found for document %s", filename) # Still proceed to delete the document itself storage.delete_document(doc_id) return chunk_uuids = [c["chunk_uuid"] for c in chunks] - print(f"Found {len(chunk_uuids)} chunks to process") + LOGGER.info("Found %d chunks to process for %s", len(chunk_uuids), filename) # Step 2: Process GraphDB - update nodes and edges delim = settings.settings.ingestion.delimiter @@ -532,7 +604,7 @@ def remove_document_from_storage(storage: Storage, filename: str) -> None: for n in nodes: name = n["name"] if n["remaining_source_ids"] < 0: - print(f"Warning: Node {name} has negative remaining_source_ids") + LOGGER.warning("Node %s has negative 
remaining_source_ids", name) elif n["remaining_source_ids"] == 0: nodes_to_delete.append(name) else: @@ -551,7 +623,7 @@ def remove_document_from_storage(storage: Storage, filename: str) -> None: for e in edges: edge_pair = (e["source_name"], e["target_name"]) if e["remaining_source_ids"] < 0: - print(f"Warning: Edge {edge_pair} has negative remaining_source_ids") + LOGGER.warning("Edge %s has negative remaining_source_ids", edge_pair) elif e["remaining_source_ids"] == 0: edges_to_delete.append(edge_pair) else: @@ -570,54 +642,54 @@ def remove_document_from_storage(storage: Storage, filename: str) -> None: # Apply graph updates if nodes_to_update: - print(f"Updating {len(nodes_to_update)} nodes with source_id changes") + LOGGER.info("Updating %d nodes with source_id changes", len(nodes_to_update)) storage.graphdb.update_nodes(nodes_to_update) if nodes_to_delete: - print(f"Deleting {len(nodes_to_delete)} nodes with no remaining source_ids") + LOGGER.info("Deleting %d nodes with no remaining source_ids", len(nodes_to_delete)) storage.delete_nodes(nodes_to_delete) if edges_to_update: - print(f"Updating {len(edges_to_update)} edges with source_id changes") + LOGGER.info("Updating %d edges with source_id changes", len(edges_to_update)) storage.graphdb.update_edges(edges_to_update) if edges_to_delete: - print(f"Deleting {len(edges_to_delete)} edges with no remaining source_ids") + LOGGER.info("Deleting %d edges with no remaining source_ids", len(edges_to_delete)) storage.delete_edges(edges_to_delete) # Step 3: Update EntityVectors - remove source_ids and delete if empty # We need to get all entities and check their metadata # Since we deleted nodes, we also need to delete their vectors if nodes_to_delete: - print(f"Removing {len(nodes_to_delete)} entity vectors") + LOGGER.info("Removing %d entity vectors", len(nodes_to_delete)) storage.delete_entity_vector(nodes_to_delete) # For updated nodes, we need to upsert them in the vector DB if nodes_to_update: - print(f"Updating 
{len(nodes_to_update)} entity vectors") + LOGGER.info("Updating %d entity vectors", len(nodes_to_update)) storage.upsert_entity_vector(nodes_to_update) # Step 4: Update RelationVectors - remove source_ids and delete if empty if edges_to_delete: - print(f"Removing {len(edges_to_delete)} relation vectors") + LOGGER.info("Removing %d relation vectors", len(edges_to_delete)) storage.delete_relation_vector(edges_to_delete) # For updated edges, we need to upsert them in the vector DB if edges_to_update: - print(f"Updating {len(edges_to_update)} relation vectors") + LOGGER.info("Updating %d relation vectors", len(edges_to_update)) storage.upsert_relation_vector(edges_to_update) # Step 5: Remove chunk vectors - print(f"Removing {len(chunk_uuids)} chunk vectors") + LOGGER.info("Removing %d chunk vectors", len(chunk_uuids)) for chunk_uuid in chunk_uuids: storage.delete_chunk_vector(chunk_uuid) # Step 6: Remove chunks from ChunksDB - print(f"Removing {len(chunk_uuids)} chunks from ChunksDB") + LOGGER.info("Removing %d chunks from ChunksDB", len(chunk_uuids)) storage.delete_chunks_by_uuids(chunk_uuids) # Step 7: Remove document from DocumentsDB - print("Removing document from DocumentsDB") + LOGGER.info("Removing document from DocumentsDB") storage.delete_document(doc_id) # Step 8: Sanity check @@ -637,15 +709,27 @@ def remove_document_from_storage(storage: Storage, filename: str) -> None: for e in edges_after: src, tgt = e.get("source_name"), e.get("target_name") if src not in node_names_after or tgt not in node_names_after: - print(f"Sanity Check Warning: Edge ({src}, {tgt}) exists without corresponding nodes after deletion.") + LOGGER.warning( + "Sanity check warning: edge (%s, %s) exists without corresponding nodes after deletion", + src, + tgt, + ) storage.delete_edges( [(src, tgt)] ) storage.delete_relation_vector( [(src, tgt)] ) - print(f"Removed edge ({src}, {tgt}) due to missing nodes.") + LOGGER.info("Removed edge (%s, %s) due to missing nodes", src, tgt) - 
print(f"Successfully removed document {filename} and all associated data") + LOGGER.info("Successfully removed document %s and all associated data", filename) -def ingest_paths(paths: List[Path]): +def ingest_paths( + paths: List[Path], + *, + documents_root: Optional[Union[Path, str]] = None, + project_paths: Optional[ProjectPaths] = None, + storage_paths=None, + audit_enabled: Optional[bool] = None, + progress_callback: Optional[Callable[[str], None]] = None, +) -> Dict[str, Any]: """ Ingests files from given paths into the knowledge graph storage system. @@ -658,39 +742,80 @@ def ingest_paths(paths: List[Path]): Returns: None """ - storage = Storage() + def report(message: str) -> None: + LOGGER.info(message) + if progress_callback is not None: + progress_callback(message) + + active_project_paths = _resolve_runtime_project_paths( + paths=paths, + documents_root=documents_root, + project_paths=project_paths, + ) + if active_project_paths is not None: + ensure_project_dirs(active_project_paths) + _configure_ingestion_logger(active_project_paths) + + effective_storage_paths = ( + active_project_paths.storage + if active_project_paths is not None + else storage_paths + ) + + report("Initializing project storage") + storage = Storage(paths=effective_storage_paths) storage.init() + LOGGER.info("Starting ingestion for %d paths", len(paths)) + report(f"Queued {len(paths)} files for ingestion") all_chunks: List[Dict[str, Any]] = [] all_entities: List[Dict[str, Any]] = [] all_relations: List[Dict[str, Any]] = [] + processed_files = 0 + skipped_files = 0 + removed_files = 0 + report("Scanning existing project documents") # Remove documents that are no longer present all_existing_docs = storage.get_all_documents() existing_filenames = {doc["filename"] for doc in all_existing_docs if doc.get("filename")} files_to_be_removed = existing_filenames - {p.name for p in paths} for fname in files_to_be_removed: + report(f"Removing stale document {fname}") 
remove_document_from_storage(storage, fname) + removed_files += 1 - for p in paths: - print(f"Processing file: {p}\n") + total_paths = len(paths) + for index, p in enumerate(paths, start=1): + step_prefix = f"{index}/{total_paths} {p.name}" + report(f"{step_prefix} - checking file") if not p.exists() or not p.is_file(): + report(f"{step_prefix} - skipped (path missing or not a file)") + skipped_files += 1 continue content_hash = file_sha256(p) if should_skip_ingestion(storage, p, content_hash): - print(f"Skipping {p.name} (unchanged).") + LOGGER.info("Skipping %s (unchanged)", p.name) + report(f"{step_prefix} - skipped (unchanged)") + skipped_files += 1 continue # skip temporary files created by ms word if ((p.name.lower().startswith("~$") and p.name.lower().endswith((".docx", ".doc"))) or (p.name.lower().endswith((".tmp", ".temp")) and "word" in p.name.lower())): - print(f"Skipping temporary file {p.name}.") + LOGGER.info("Skipping temporary file %s", p.name) + report(f"{step_prefix} - skipped (temporary file)") + skipped_files += 1 continue + report(f"{step_prefix} - parsing file") pages, file_meta = parse_to_pages(p) if not pages or not file_meta: - print(f"Skipping {p} due to parsing error.") + LOGGER.warning("Skipping %s due to parsing error", p) + report(f"{step_prefix} - parsing failed") + skipped_files += 1 continue doc_exists = storage.get_document_by_filename(p.name).get("filename") == p.name if storage.get_document_by_filename(p.name) else False if doc_exists: # document exists but content hash differs. 
+ report(f"{step_prefix} - replacing changed document") remove_document_from_storage(storage, p.name) file_meta = normalize_metadata(file_meta) @@ -701,10 +826,10 @@ def ingest_paths(paths: List[Path]): doc_meta = { "doc_id": str(uuid.uuid4()), "filename": p.name, - "filepath": str(p), # keep if useful for tracing + "filepath": str(p.resolve()), "file_size": st.st_size, "last_modified": st.st_mtime, - "created": st.st_birthtime, + "created": getattr(st, "st_birthtime", st.st_ctime), "extension": p.suffix.lower(), "mime_type": ((file_meta or {}).get("mime_type") or mimetypes.guess_type(str(p))[0] or ""), "language": (file_meta or {}).get("language", "unknown"), @@ -712,15 +837,29 @@ def ingest_paths(paths: List[Path]): "full_char_count": len(full_text), } + report(f"{step_prefix} - storing document") storage.add_document(doc_meta, full_text) # from storage.py - chunks = build_chunks(pages, doc_meta["doc_id"], doc_meta["filename"]) + report(f"{step_prefix} - building chunks") + chunks = build_chunks( + pages, + doc_meta["doc_id"], + doc_meta["filename"], + filepath=doc_meta["filepath"], + document_language=doc_meta["language"], + ) + report(f"{step_prefix} - storing {len(chunks)} chunks") storage.add_chunks(chunks) # from storage.py all_chunks.extend(chunks) # Extract entities and relations from chunks # res['entities'], res['relationships'], res['content_keywords'] - res = extract_from_chunks(chunks) # from extractor.py + report(f"{step_prefix} - extracting entities and relations") + res = extract_from_chunks( + chunks, + storage=storage, + audit_enabled=audit_enabled, + ) # Consolidate/merge entities (by (name,type)) and upsert those first entities_in = res.get("entities", []) or [] @@ -731,46 +870,75 @@ def ingest_paths(paths: List[Path]): if placeholders: all_entities.extend(placeholders) # collect for vector DB later + report(f"{step_prefix} - merging graph data") nodes, edges = merge_graph_data(storage, entities_in, edges_in) if nodes: + report(f"{step_prefix} 
- writing {len(nodes)} entities") storage.upsert_nodes(nodes) # write schema all_entities.extend(nodes) # collect for vector DB later # Group/merge edges and upsert if edges: + report(f"{step_prefix} - writing {len(edges)} relations") storage.upsert_edges(edges) # write schema all_relations.extend(edges) # collect for vector DB later + if res.get("audits"): + report(f"{step_prefix} - writing extraction audit") + _write_extraction_audits(active_project_paths, p.name, res.get("audits", []) or []) + report(f"{step_prefix} - completed") + processed_files += 1 # Finally, add all chunks, entities, and relations to vector DB if all_chunks: + report("Writing chunk vectors") storage.upsert_chunk_vector(all_chunks) # from storage.py deduped_entities = dedupe_entities_for_vectors(all_entities) - # if all_chunks: - # print(f"[ingestion] sample chunk: {all_chunks[0]}") if deduped_entities: + report("Writing entity vectors") storage.upsert_entity_vector(deduped_entities) if all_relations: + report("Writing relation vectors") storage.upsert_relation_vector(all_relations) + report("Writing retrieval snapshot") + retrieval_snapshot = _write_retrieval_graph_snapshot(storage, active_project_paths) + report("Completed ingestion") + LOGGER.info( + "Completed ingestion: processed=%d skipped=%d removed=%d chunks=%d entities=%d relations=%d snapshot=%s", + processed_files, + skipped_files, + removed_files, + len(all_chunks), + len(all_entities), + len(all_relations), + retrieval_snapshot, + ) + return { + "documents_root": str(active_project_paths.documents_root) if active_project_paths else None, + "project_root": str(active_project_paths.project_root) if active_project_paths else None, + "retrieval_graph_pickle": str(retrieval_snapshot) if retrieval_snapshot is not None else None, + "processed_files": processed_files, + "skipped_files": skipped_files, + "removed_files": removed_files, + "chunk_count": len(all_chunks), + "entity_count": len(all_entities), + "relation_count": 
len(all_relations), + } def main(): - # root = Path('docs') - fileparser = FileParser() - # Get source folder with docs from user - content_folder_path = input("Source folder path of documents (including path): ") - if not content_folder_path or not os.path.isdir(content_folder_path): - print("Please enter a valid folder path.") + folder_input = input( + "Source folder path of documents (leave blank for ./docs): " + ).strip() + root = Path(folder_input) if folder_input else Path("docs") + paths = list_document_paths(root) + if not paths: + print( + f"No supported files found in {root.resolve()} " + f"with extensions: {', '.join(VALID_EXTENSIONS)}." + ) return - else: - paths = [Path(os.path.join(content_folder_path, f)) for f in os.listdir(content_folder_path) - if ((os.path.isfile(os.path.join(content_folder_path, f))) and (Path(f).suffix in settings.VALID_EXTENSIONS))] - if not paths: - print(f"📂 No files found with extensions: {', '.join(settings.VALID_EXTENSIONS)}.") - return - else: - ingest_paths(paths) - + ingest_paths(paths, documents_root=root) if __name__ == "__main__": main() diff --git a/graph/lightrag.py b/graph/lightrag.py index ba21462..b1cfb4e 100644 --- a/graph/lightrag.py +++ b/graph/lightrag.py @@ -7,6 +7,7 @@ from dataclasses import dataclass from functools import lru_cache from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple +from pathlib import Path import networkx as nx import tiktoken @@ -17,6 +18,10 @@ from db_storage import StoragePaths from llm import Chat from prompts import PROMPTS +from logging_utils import configure_file_logger +from graph_pickle import load_or_build_graph_snapshot +from project_paths import ProjectPaths +from query_logging import write_query_log LOGGER = logging.getLogger("LightRAG") @@ -72,13 +77,13 @@ def render_full_context(result: RetrievalResult) -> str: # Logging and token helpers # --------------------------------------------------------------------------- -def set_logger(log_file: str) -> 
None: - """Configure the package wide logger.""" - LOGGER.setLevel(logging.INFO) - handler = logging.FileHandler(log_file) - handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")) - if not LOGGER.handlers: - LOGGER.addHandler(handler) +def set_logger(log_file: Path) -> None: + configure_file_logger( + "LightRAG", + log_file=log_file, + level=settings.logging.retrieval_level, + enabled=settings.logging.retrieval_enabled, + ) @lru_cache(maxsize=4) @@ -164,10 +169,16 @@ class GraphSnapshot: class StorageAdapter: """High level helper around the ingestion Storage facade.""" - def __init__(self, paths: Optional[StoragePaths] = None): + def __init__( + self, + paths: Optional[StoragePaths] = None, + *, + graph_pickle_path: Optional[Path] = None, + ): self._storage = Storage(paths=paths) self._storage.init() self._graph_snapshot: Optional[GraphSnapshot] = None + self._graph_pickle_path = graph_pickle_path @property def graph(self) -> nx.Graph: @@ -180,156 +191,66 @@ def refresh_graph(self) -> None: self._graph_snapshot = self._load_graph() def _load_graph(self) -> GraphSnapshot: - """Load graph from storage into NetworkX.""" - graph = nx.Graph() - - with self._storage.graphdb.connect() as con: - node_rows = con.execute( - "SELECT name, type, description, source_id, filepath FROM nodes;" - ).fetchall() - edge_rows = con.execute( - "SELECT source_name, target_name, weight, description, keywords, " - "source_id, filepath FROM edges;" - ).fetchall() - - # Add nodes with chunk_uuid list - for name, type_, description, source_id, filepath in node_rows: - node_id = (name or "").strip() - if not node_id: - continue - - # Parse source_id to get chunk_uuids - chunk_uuids = [] - if source_id: - chunk_uuids = [s.strip() for s in source_id.split("||") if s.strip()] - - graph.add_node( - node_id, - type=(type_ or "unknown").strip() or "unknown", - description=(description or "").strip(), - source_id=(source_id or "").strip(), - filepath=(filepath or 
"").strip(), - chunk_uuids=chunk_uuids, - ) - - # Add edges with chunk_uuid list - for row in edge_rows: - source, target, weight, description, keywords, source_id, filepath = row - src_id = (source or "").strip() - tgt_id = (target or "").strip() - if not src_id or not tgt_id: - continue - if src_id not in graph or tgt_id not in graph: - LOGGER.debug("Skipping edge with missing endpoints: %s -> %s", src_id, tgt_id) - continue - - # Parse source_id to get chunk_uuids - chunk_uuids = [] - if source_id: - chunk_uuids = [s.strip() for s in source_id.split("||") if s.strip()] - - graph.add_edge( - src_id, - tgt_id, - weight=float(weight) if weight is not None else 1.0, - description=(description or "").strip(), - keywords=(keywords or "").strip(), - source_id=(source_id or "").strip(), - filepath=(filepath or "").strip(), - chunk_uuids=chunk_uuids, - ) - - LOGGER.debug( - "Loaded graph snapshot with %d nodes and %d edges", - graph.number_of_nodes(), - graph.number_of_edges(), + graph = load_or_build_graph_snapshot( + self._storage, + snapshot_path=self._graph_pickle_path, + logger=LOGGER, ) return GraphSnapshot(graph=graph) def query_entities(self, text: str, limit: int = 5) -> List[Dict[str, Any]]: """Query entity vector index and return entity information with IDs.""" - results = self._storage.entity_vectors.query(text=text, n_results=limit) or [] + results = self._storage.search_entities(text=text, n_results=limit) or [] matches: List[Dict[str, Any]] = [] if not results: return matches - ids = results[0].get("ids", []) - metadatas = results[0].get("metadatas", []) - distances = results[0].get("distances", []) - - for i, metadata in enumerate(metadatas): + for metadata in results: if not isinstance(metadata, dict): continue - entity_id = ids[i] if i < len(ids) else "" - distance = distances[i] if i < len(distances) else None matches.append({ - "id": entity_id, - "name": metadata.get("name", entity_id), + "id": metadata.get("name", ""), + "name": metadata.get("name", 
""), "type": metadata.get("type"), "description": metadata.get("description", ""), - "score": _distance_to_similarity(distance), + "score": float(metadata.get("score", 0.0) or 0.0), }) return matches def query_relations(self, text: str, limit: int = 5) -> List[Dict[str, Any]]: """Query relation vector index and return relation information with IDs.""" - results = self._storage.relation_vectors.query(text=text, n_results=limit) or [] + results = self._storage.search_relations(text=text, n_results=limit) or [] matches: List[Dict[str, Any]] = [] if not results: return matches - ids = results[0].get("ids", []) - metadatas = results[0].get("metadatas", []) - distances = results[0].get("distances", []) - - for i, metadata in enumerate(metadatas): + for metadata in results: if not isinstance(metadata, dict): continue - relation_id = ids[i] if i < len(ids) else "" - distance = distances[i] if i < len(distances) else None matches.append({ - "id": relation_id, + "id": f"{metadata.get('source_name', '')}::{metadata.get('target_name', '')}", "source_name": metadata.get("source_name", ""), "target_name": metadata.get("target_name", ""), "description": metadata.get("description", ""), "keywords": metadata.get("keywords", ""), - "score": _distance_to_similarity(distance), + "score": float(metadata.get("score", 0.0) or 0.0), }) return matches def query_chunks(self, text: str, limit: int = 5) -> List[Dict[str, Any]]: """Query chunk vector index and return chunk information.""" - results = self._storage.chunk_vectors.query(text=text, n_results=limit) or [] + results = self._storage.search_chunks(text=text, n_results=limit) or [] matches: List[Dict[str, Any]] = [] - - for result in results: - metadatas = self._as_list(result.get("metadatas")) - ids = self._as_list(result.get("ids")) - distances = self._as_list(result.get("distances")) - documents = self._as_list(result.get("documents")) - - max_len = max( - (len(seq) for seq in (metadatas, ids, distances, documents) if seq), - 
default=0, - ) - - for index in range(max_len): - metadata = metadatas[index] if index < len(metadatas) else {} - if not isinstance(metadata, dict): - metadata = {} - chunk_id = ids[index] if index < len(ids) else "" - distance = distances[index] if index < len(distances) else None - document = documents[index] if index < len(documents) else "" - if not isinstance(document, str): - document = str(document or "") - - matches.append({ - "chunk_uuid": str(chunk_id), - "document_id": str(metadata.get("doc_id", "")), - "filename": str(metadata.get("filename", "")), - "text": document, - "score": _distance_to_similarity(distance), - }) + for metadata in results: + if not isinstance(metadata, dict): + continue + matches.append({ + "chunk_uuid": str(metadata.get("chunk_uuid", "")), + "document_id": str(metadata.get("doc_id", "")), + "filename": str(metadata.get("filename", "")), + "text": str(metadata.get("text", "")), + "score": float(metadata.get("score", 0.0) or 0.0), + }) return matches def get_chunk_by_uuid(self, chunk_uuid: str) -> Optional[Dict[str, Any]]: @@ -418,7 +339,7 @@ async def extract_keywords( Tuple of (hl_keywords, ll_keywords) """ examples = "\n".join(PROMPTS["keywords_extraction_examples"]) - language = PROMPTS["DEFAULT_LANGUAGE"] + language = settings.prompts.default_language history_context = get_conversation_turns(conversation_history, history_turns) prompt = PROMPTS["keywords_extraction"].format( @@ -652,6 +573,9 @@ def get_vector_context( for chunk in chunk_matches: all_chunks.append({ "id": chunk["chunk_uuid"], + "chunk_uuid": chunk["chunk_uuid"], + "document_id": chunk.get("document_id", ""), + "filename": chunk.get("filename", ""), "text": chunk["text"], "source_type": "vector", "score": chunk.get("score", 0.0), @@ -708,6 +632,9 @@ def extract_chunks_from_nodes( if chunk: result_chunks.append({ "id": chunk["chunk_uuid"], + "chunk_uuid": chunk["chunk_uuid"], + "document_id": chunk.get("doc_id", ""), + "filename": chunk.get("filename", ""), 
"text": chunk.get("text", ""), "order": item["index"], "relation": item["relation_score"], @@ -749,6 +676,9 @@ def extract_chunks_from_edges( if chunk: result_chunks.append({ "id": chunk["chunk_uuid"], + "chunk_uuid": chunk["chunk_uuid"], + "document_id": chunk.get("doc_id", ""), + "filename": chunk.get("filename", ""), "text": chunk.get("text", ""), "order": index, "source_type": "relationship", @@ -946,10 +876,15 @@ def lightrag_prompt( context = naive_context if settings.retrieval.light_mode == "naive" else kg_context + naive_context history_context = get_conversation_turns(history) - user_prompt = PROMPTS["DEFAULT_USER_PROMPT"] - sys_prompt_template = PROMPTS["lightrag_response"] if settings.retrieval.light_mode != "naive" else PROMPTS["rag_response_naive"] + user_prompt = settings.prompts.default_user_prompt + sys_prompt_template = ( + PROMPTS["lightrag_response"] + if settings.retrieval.light_mode != "naive" + else PROMPTS["naive_rag_response"] + ) sys_prompt = sys_prompt_template.format( context_data=context, + content_data=context, response_type=settings.retrieval.response_type, history=history_context, user_prompt=user_prompt, @@ -958,6 +893,25 @@ def lightrag_prompt( return sys_prompt +def _retrieval_model_metadata() -> Dict[str, Any]: + provider = settings.provider.provider + if provider == "azure": + return { + "provider": provider, + "model_name": settings.provider.azure_llm_deployment, + "model_version": settings.provider.azure_api_version, + "api_version": settings.provider.azure_api_version, + "endpoint": settings.provider.azure_endpoint, + } + return { + "provider": provider, + "model_name": settings.provider.openai_llm_model, + "model_version": settings.provider.openai_llm_model, + "api_version": "", + "endpoint": settings.provider.openai_base_url or "https://api.openai.com/v1", + } + + # --------------------------------------------------------------------------- # LightRAG entry point # 
--------------------------------------------------------------------------- @@ -975,12 +929,35 @@ def __init__( self, *, storage_paths: Optional[StoragePaths] = None, + project_paths: Optional[ProjectPaths] = None, system_prompt: Optional[str] = None, - log_file: str = "LightRAG.log", + log_file: Optional[str] = None, ) -> None: - set_logger(log_file) + self._project_paths = project_paths + effective_storage_paths = ( + storage_paths + if storage_paths is not None + else (project_paths.storage if project_paths is not None else None) + ) + effective_log_file = ( + Path(log_file) + if log_file is not None + else ( + project_paths.lightrag_log_file + if project_paths is not None + else Path("LightRAG.log") + ) + ) + set_logger(effective_log_file) LOGGER.info("Initialising LightRAG retriever") - self._storage = StorageAdapter(paths=storage_paths) + self._storage = StorageAdapter( + paths=effective_storage_paths, + graph_pickle_path=( + project_paths.retrieval_graph_pickle_file + if project_paths is not None + else None + ), + ) self._chat = RetrieveChat(system_prompt=system_prompt) async def aretrieve( @@ -1004,7 +981,14 @@ async def aretrieve( # Handle empty keywords if hl_keywords == [] and ll_keywords == []: LOGGER.warning("low_level_keywords and high_level_keywords is empty") - return PROMPTS["fail_response"] + return RetrievalResult( + answer=PROMPTS["fail_response"], + entities_context=[], + relations_context=[], + all_chunks=[], + hl_keywords=[], + ll_keywords=[], + ) if ll_keywords == [] and retrieval_mode in ["local", "hybrid"]: LOGGER.warning(f"low_level_keywords is empty, switching from {retrieval_mode} mode to global mode") retrieval_mode = "global" @@ -1041,7 +1025,7 @@ async def aretrieve( temperature=settings.retrieval.llm_temperature, ) - return RetrievalResult( + result = RetrievalResult( answer=answer, entities_context=entities_context, relations_context=relations_context, @@ -1049,6 +1033,33 @@ async def aretrieve( hl_keywords=hl_keywords, 
ll_keywords=ll_keywords, ) + if settings.logging.qa_enabled: + write_query_log( + project_paths=self._project_paths, + retriever_name="lightrag", + payload={ + "question": question, + "answer": answer, + "active_documents_root": str(self._project_paths.documents_root) + if self._project_paths + else None, + "conversation_history": conversation_history or [], + "retrieval_metadata": { + "retrieval_mode": retrieval_mode, + "response_type": settings.retrieval.response_type, + "entity_top_k": settings.retrieval.entity_top_k, + "relation_top_k": settings.retrieval.relation_top_k, + "chunk_top_k": settings.retrieval.chunk_top_k, + }, + "model": _retrieval_model_metadata(), + "high_level_keywords": hl_keywords, + "low_level_keywords": ll_keywords, + "retrieved_entities": entities_context, + "retrieved_relationships": relations_context, + "retrieved_chunks": all_chunks, + }, + ) + return result def retrieve( self, diff --git a/graph/logging_utils.py b/graph/logging_utils.py new file mode 100644 index 0000000..64967d6 --- /dev/null +++ b/graph/logging_utils.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +import logging +from pathlib import Path + + +def configure_file_logger( + name: str, + *, + log_file: Path, + level: str = "INFO", + enabled: bool = True, +) -> logging.Logger: + logger = logging.getLogger(name) + logger.setLevel(getattr(logging, (level or "INFO").upper(), logging.INFO)) + logger.propagate = False + + handler_key = "appl_kgraph_managed" + for handler in list(logger.handlers): + if getattr(handler, handler_key, False): + logger.removeHandler(handler) + handler.close() + + if enabled: + log_file.parent.mkdir(parents=True, exist_ok=True) + handler = logging.FileHandler(log_file, encoding="utf-8") + handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")) + setattr(handler, handler_key, True) + logger.addHandler(handler) + + return logger diff --git a/graph/main.py b/graph/main.py index 802d666..1e86348 100644 --- 
a/graph/main.py +++ b/graph/main.py @@ -1,87 +1,92 @@ -import asyncio -from pathlib import Path -from typing import Optional -# local imports -from ingestion import ingest_paths -from pathrag import PathRAG, render_full_context -from fileparser import FileParser -from lightrag import RetrievalResult - - -async def ask_with_pathrag(question: str, verbose: bool = False, conversation_history=None) -> None: - """ - Asks a question using PathRAG retrieval and prints the answer with context. - - Args: - question (str): The question to ask. - verbose (bool, optional): If True, displays full context details. Defaults to False. - conversation_history (optional): List of (role, text) tuples for conversation history. - - Returns: - None - """ - rag = PathRAG( - system_prompt="" - ) - result = await rag.aretrieve(question, conversation_history=conversation_history) - print("Answer:\n", result.answer) - - print(render_full_context(result) if verbose else "") - if not verbose: - for window in result.context_windows: - print(f"\n[{window.label}] score={window.score:.2f}\n{window.text}") - - -async def ask_with_lightrag(question: str, verbose: Optional[bool] = False, history: Optional[list] = None) -> RetrievalResult: - """ - Asks a question using LightRAG retrieval and prints the answer with context. - - Args: - question (str): The question to ask. - verbose (bool, optional): If True, displays full context details. Defaults to False. - history (list, optional): Conversation history. Defaults to None. - - Returns: - Result object containing the answer and context. - """ - from lightrag import LightRAG - from lightrag import render_full_context - rag = LightRAG( - system_prompt="" - ) - result = await rag.aretrieve(question, conversation_history=history) - print("Answer:\n", result.answer) - - print(render_full_context(result) if verbose else "") - - return result - - -def main(): - """ - Main entry point for document ingestion and Q&A demonstration. 
- - Ingests documents from the 'docs' directory and runs a sample PathRAG query. - """ - root = Path('docs') - paths = FileParser(root).filepaths - if not paths: - print("No files to ingest.") - return - ingest_paths(paths) - # query = "Who are the authors of LayoutParser and do they overlap any of the other articles?" - query = input("Enter your question: ") - conversation_history = [] # List[Tuple[str, str]] with role in {"user", "assistant"} - while query not in ("exit", "quit"): - print("\n--- PathRAG Response ---\n") - asyncio.run(ask_with_pathrag(query, verbose=True, conversation_history=conversation_history)) - print("\n---\n") - print("\n--- LightRAG Response ---\n") - result = asyncio.run(ask_with_lightrag(query, verbose=True, history=conversation_history)) - conversation_history.append(("user", query)) - conversation_history.append(("assistant", result.answer)) - print("\n---\n") - query = input("Enter your next question: ") - -if __name__ == "__main__": - main() +from __future__ import annotations + +import argparse +import asyncio +from pathlib import Path +from typing import List, Optional, Tuple + +from ingestion import ingest_paths +from lightrag import LightRAG, RetrievalResult +from pathrag import PathRAG, render_full_context +from project_paths import list_document_paths, resolve_project_paths + + +async def ask_with_pathrag( + question: str, + *, + documents_root: Path, + verbose: bool = False, + conversation_history: Optional[List[Tuple[str, str]]] = None, +) -> None: + project_paths = resolve_project_paths(documents_root) + rag = PathRAG(project_paths=project_paths, system_prompt="") + result = await rag.aretrieve(question, conversation_history=conversation_history) + print("Answer:\n", result.answer) + if verbose: + print(render_full_context(result)) + elif result.context_windows: + for window in result.context_windows: + print(f"\n[{window.label}] score={window.score:.2f}\n{window.text}") + + +async def ask_with_lightrag( + question: str, + 
*, + documents_root: Path, + verbose: bool = False, + history: Optional[List[Tuple[str, str]]] = None, +) -> RetrievalResult: + project_paths = resolve_project_paths(documents_root) + rag = LightRAG(project_paths=project_paths, system_prompt="") + result = await rag.aretrieve(question, conversation_history=history) + print("Answer:\n", result.answer) + if verbose: + from lightrag import render_full_context as render_lightrag_context + + print(render_lightrag_context(result)) + return result + + +def main() -> None: + parser = argparse.ArgumentParser(description="Ingest a document folder and query the project-scoped RAG stores.") + parser.add_argument("documents_root", nargs="?", default="docs", help="Folder containing documents to ingest and query.") + args = parser.parse_args() + + documents_root = Path(args.documents_root).expanduser().resolve() + paths = list_document_paths(documents_root) + if not paths: + print("No files to ingest.") + return + + ingest_paths(paths, documents_root=documents_root) + + conversation_history: List[Tuple[str, str]] = [] + query = input("Enter your question: ") + while query not in ("exit", "quit"): + print("\n--- PathRAG Response ---\n") + asyncio.run( + ask_with_pathrag( + query, + documents_root=documents_root, + verbose=True, + conversation_history=conversation_history, + ) + ) + print("\n---\n") + print("\n--- LightRAG Response ---\n") + result = asyncio.run( + ask_with_lightrag( + query, + documents_root=documents_root, + verbose=True, + history=conversation_history, + ) + ) + conversation_history.append(("user", query)) + conversation_history.append(("assistant", result.answer)) + print("\n---\n") + query = input("Enter your next question: ") + + +if __name__ == "__main__": + main() diff --git a/graph/pathrag.py b/graph/pathrag.py index 675667b..b960027 100644 --- a/graph/pathrag.py +++ b/graph/pathrag.py @@ -1,25 +1,30 @@ """Single-file PathRAG retriever integrated with the ingestion storage backend.""" from __future__ 
import annotations -import asyncio -import logging -import json -from dataclasses import dataclass -from functools import lru_cache -from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple -from collections import defaultdict - -import networkx as nx -import tiktoken - -from settings import settings -from db_storage import Storage -from db_storage import StoragePaths -from llm import Chat -from prompts import PROMPTS - - -LOGGER = logging.getLogger("PathRAG") +import asyncio +import logging +import json +from dataclasses import dataclass +from functools import lru_cache +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple +from collections import defaultdict +from pathlib import Path + +import networkx as nx +import tiktoken + +from settings import settings +from db_storage import Storage +from db_storage import StoragePaths +from llm import Chat +from prompts import PROMPTS +from logging_utils import configure_file_logger +from graph_pickle import load_or_build_graph_snapshot +from project_paths import ProjectPaths +from query_logging import write_query_log + + +LOGGER = logging.getLogger("PathRAG") # --------------------------------------------------------------------------- # Verbosity helper @@ -82,14 +87,13 @@ def render_full_context(result: RetrievalResult) -> str: # Logging and token helpers # --------------------------------------------------------------------------- -def set_logger(log_file: str) -> None: - """Configure the package wide logger.""" - - LOGGER.setLevel(logging.INFO) - handler = logging.FileHandler(log_file) - handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")) - if not LOGGER.handlers: - LOGGER.addHandler(handler) +def set_logger(log_file: Path) -> None: + configure_file_logger( + "PathRAG", + log_file=log_file, + level=settings.logging.retrieval_level, + enabled=settings.logging.retrieval_enabled, + ) @lru_cache(maxsize=4) @@ -373,14 +377,20 @@ class GraphSnapshot: # 
--------------------------------------------------------------------------- -class StorageAdapter: - """High level helper around the ingestion ``Storage`` facade.""" - - def __init__(self, paths: Optional[StoragePaths] = None): - self._storage = Storage(paths=paths) - # Ensure tables exist before we start querying them. - self._storage.init() - self._graph_snapshot: Optional[GraphSnapshot] = None +class StorageAdapter: + """High level helper around the ingestion ``Storage`` facade.""" + + def __init__( + self, + paths: Optional[StoragePaths] = None, + *, + graph_pickle_path: Optional[Path] = None, + ): + self._storage = Storage(paths=paths) + # Ensure tables exist before we start querying them. + self._storage.init() + self._graph_snapshot: Optional[GraphSnapshot] = None + self._graph_pickle_path = graph_pickle_path # ------------------------------------------------------------------ # Graph helpers @@ -396,55 +406,13 @@ def refresh_graph(self) -> None: self._graph_snapshot = self._load_graph() - def _load_graph(self) -> GraphSnapshot: - graph = nx.Graph() - - with self._storage.graphdb.connect() as con: - node_rows = con.execute( - "SELECT name, type, description, source_id, filepath FROM nodes;" - ).fetchall() - edge_rows = con.execute( - "SELECT source_name, target_name, weight, description, keywords, " - "source_id, filepath FROM edges;" - ).fetchall() - - for name, type_, description, source_id, filepath in node_rows: - node_id = (name or "").strip() - if not node_id: - continue - graph.add_node( - node_id, - type=(type_ or "unknown").strip() or "unknown", - description=(description or "").strip(), - source_id=(source_id or "").strip(), - filepath=(filepath or "").strip(), - ) - - for row in edge_rows: - source, target, weight, description, keywords, source_id, filepath = row - src_id = (source or "").strip() - tgt_id = (target or "").strip() - if not src_id or not tgt_id: - continue - if src_id not in graph or tgt_id not in graph: - LOGGER.debug("Skipping 
edge with missing endpoints: %s -> %s", src_id, tgt_id) - continue - graph.add_edge( - src_id, - tgt_id, - weight=float(weight) if weight is not None else 1.0, - description=(description or "").strip(), - keywords=(keywords or "").strip(), - source_id=(source_id or "").strip(), - filepath=(filepath or "").strip(), - ) - - LOGGER.debug( - "Loaded graph snapshot with %d nodes and %d edges", - graph.number_of_nodes(), - graph.number_of_edges(), - ) - return GraphSnapshot(graph=graph) + def _load_graph(self) -> GraphSnapshot: + graph = load_or_build_graph_snapshot( + self._storage, + snapshot_path=self._graph_pickle_path, + logger=LOGGER, + ) + return GraphSnapshot(graph=graph) def get_node(self, name: str) -> Optional[Dict[str, Any]]: node_name = name.strip() @@ -476,95 +444,64 @@ def _as_list(value: Any) -> List[Any]: return list(value) return [value] - def query_entities(self, text: str, limit: int = 5) -> List[EntityMatch]: - """Query entity vector index and return EntityMatch objects.""" - results = self._storage.entity_vectors.query(text=text, n_results=limit) or [] - matches: List[EntityMatch] = [] - if not results: - return matches - - # Chroma returns a single dict with lists - ids = results[0].get("ids", []) - metadatas = results[0].get("metadatas", []) - distances = results[0].get("distances", []) - - for i, metadata in enumerate(metadatas): - if not isinstance(metadata, dict): - continue - entity_id = ids[i] if i < len(ids) else "" - distance = distances[i] if i < len(distances) else None - matches.append( - EntityMatch( - name=metadata.get("name", entity_id), - type=metadata.get("type"), - description=metadata.get("description", ""), - score=_distance_to_similarity(distance), - ) - ) - return matches - - def query_relations(self, text: str, limit: int = 5) -> List[RelationMatch]: - """Query relation vector index and return RelationMatch objects.""" - results = self._storage.relation_vectors.query(text=text, n_results=limit) or [] - matches: 
List[RelationMatch] = [] - if not results: - return matches - - ids = results[0].get("ids", []) - metadatas = results[0].get("metadatas", []) - distances = results[0].get("distances", []) - - for i, metadata in enumerate(metadatas): - if not isinstance(metadata, dict): - continue - distance = distances[i] if i < len(distances) else None - matches.append( - RelationMatch( - source_name=metadata.get("source_name", ""), - target_name=metadata.get("target_name", ""), - description=metadata.get("description", ""), - keywords=metadata.get("keywords", ""), - score=_distance_to_similarity(distance), - ) - ) - return matches - - - def query_chunks(self, text: str, limit: int = 5) -> List[ChunkMatch]: - results = self._storage.chunk_vectors.query(text=text, n_results=limit) or [] - matches: List[ChunkMatch] = [] - for result in results: - metadatas = self._as_list(result.get("metadatas")) - ids = self._as_list(result.get("ids")) - distances = self._as_list(result.get("distances")) - documents = self._as_list(result.get("documents")) - max_len = max( - ( - len(seq) - for seq in (metadatas, ids, distances, documents) - if seq - ), - default=0, - ) - for index in range(max_len): - metadata = metadatas[index] if index < len(metadatas) else {} - if not isinstance(metadata, dict): - metadata = {} - chunk_id = ids[index] if index < len(ids) else "" - distance = distances[index] if index < len(distances) else None - document = documents[index] if index < len(documents) else "" - if not isinstance(document, str): - document = str(document or "") - matches.append( - ChunkMatch( - chunk_uuid=str(chunk_id), - document_id=str(metadata.get("doc_id", "")), - filename=str(metadata.get("filename", "")), - text=document, - score=_distance_to_similarity(distance), - ) - ) - return matches + def query_entities(self, text: str, limit: int = 5) -> List[EntityMatch]: + """Query entity vector index and return EntityMatch objects.""" + results = self._storage.search_entities(text=text, 
n_results=limit) or [] + matches: List[EntityMatch] = [] + if not results: + return matches + + for metadata in results: + if not isinstance(metadata, dict): + continue + matches.append( + EntityMatch( + name=metadata.get("name", ""), + type=metadata.get("type"), + description=metadata.get("description", ""), + score=float(metadata.get("score", 0.0) or 0.0), + ) + ) + return matches + + def query_relations(self, text: str, limit: int = 5) -> List[RelationMatch]: + """Query relation vector index and return RelationMatch objects.""" + results = self._storage.search_relations(text=text, n_results=limit) or [] + matches: List[RelationMatch] = [] + if not results: + return matches + + for metadata in results: + if not isinstance(metadata, dict): + continue + matches.append( + RelationMatch( + source_name=metadata.get("source_name", ""), + target_name=metadata.get("target_name", ""), + description=metadata.get("description", ""), + keywords=metadata.get("keywords", ""), + score=float(metadata.get("score", 0.0) or 0.0), + ) + ) + return matches + + + def query_chunks(self, text: str, limit: int = 5) -> List[ChunkMatch]: + results = self._storage.search_chunks(text=text, n_results=limit) or [] + matches: List[ChunkMatch] = [] + for metadata in results: + if not isinstance(metadata, dict): + continue + matches.append( + ChunkMatch( + chunk_uuid=str(metadata.get("chunk_uuid", "")), + document_id=str(metadata.get("doc_id", "")), + filename=str(metadata.get("filename", "")), + text=str(metadata.get("text", "")), + score=float(metadata.get("score", 0.0) or 0.0), + ) + ) + return matches # ------------------------------------------------------------------ # Convenience helpers @@ -772,10 +709,29 @@ def build_context_for_prompt( # --------------------------------------------------------------------------- # !! lightweight placeholder for testing, replace with imported PROMPT when finished. -RAG_PROMPT = ( +RAG_PROMPT = ( "You are a helpful assistant. 
Use the supplied context to answer the question." "\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer in Markdown." -) +) + + +def _retrieval_model_metadata() -> Dict[str, Any]: + provider = settings.provider.provider + if provider == "azure": + return { + "provider": provider, + "model_name": settings.provider.azure_llm_deployment, + "model_version": settings.provider.azure_api_version, + "api_version": settings.provider.azure_api_version, + "endpoint": settings.provider.azure_endpoint, + } + return { + "provider": provider, + "model_name": settings.provider.openai_llm_model, + "model_version": settings.provider.openai_llm_model, + "api_version": "", + "endpoint": settings.provider.openai_base_url or "https://api.openai.com/v1", + } # --------------------------------------------------------------------------- @@ -790,17 +746,40 @@ class PathRAG: result = rag.retrieve("What is the capital of France?") """ - def __init__( - self, - *, - storage_paths: Optional[StoragePaths] = None, - system_prompt: Optional[str] = None, - log_file: str = "PathRAG.log", - ) -> None: - set_logger(log_file) - LOGGER.info("Initialising PathRAG retriever") - self._storage = StorageAdapter(paths=storage_paths) - self._chat = RetrieveChat(system_prompt=system_prompt) + def __init__( + self, + *, + storage_paths: Optional[StoragePaths] = None, + project_paths: Optional[ProjectPaths] = None, + system_prompt: Optional[str] = None, + log_file: Optional[str] = None, + ) -> None: + self._project_paths = project_paths + effective_storage_paths = ( + storage_paths + if storage_paths is not None + else (project_paths.storage if project_paths is not None else None) + ) + effective_log_file = ( + Path(log_file) + if log_file is not None + else ( + project_paths.pathrag_log_file + if project_paths is not None + else Path("PathRAG.log") + ) + ) + set_logger(effective_log_file) + LOGGER.info("Initialising PathRAG retriever") + self._storage = StorageAdapter( + paths=effective_storage_paths, + 
graph_pickle_path=( + project_paths.retrieval_graph_pickle_file + if project_paths is not None + else None + ), + ) + self._chat = RetrieveChat(system_prompt=system_prompt) # ------------------------------------------------------------------ # Retrieval entry points @@ -930,18 +909,74 @@ def build_local_windows( user_prompt=question, ) - answer = await self._chat.generate( - prompt, - max_tokens=settings.retrieval.llm_max_tokens, - temperature=settings.retrieval.llm_temperature, - ) - return RetrievalResult( - answer=answer, - context_windows=context_windows, - entity_matches=entity_matches, - relation_matches=relation_matches, - chunk_matches=chunk_matches, - ) + answer = await self._chat.generate( + prompt, + max_tokens=settings.retrieval.llm_max_tokens, + temperature=settings.retrieval.llm_temperature, + ) + result = RetrievalResult( + answer=answer, + context_windows=context_windows, + entity_matches=entity_matches, + relation_matches=relation_matches, + chunk_matches=chunk_matches, + ) + if settings.logging.qa_enabled: + write_query_log( + project_paths=self._project_paths, + retriever_name="pathrag", + payload={ + "question": question, + "answer": answer, + "active_documents_root": str(self._project_paths.documents_root) + if self._project_paths + else None, + "conversation_history": conversation_history or [], + "retrieval_metadata": { + "response_type": settings.retrieval.response_type, + "entity_top_k": settings.retrieval.entity_top_k, + "relation_top_k": settings.retrieval.relation_top_k, + "chunk_top_k": settings.retrieval.chunk_top_k, + "global_window_count": len(global_windows), + "local_window_count": len(local_windows), + }, + "model": _retrieval_model_metadata(), + "context_windows": [ + {"label": window.label, "text": window.text, "score": window.score} + for window in context_windows + ], + "retrieved_entities": [ + { + "name": match.name, + "type": match.type, + "description": match.description, + "score": match.score, + } + for match in 
entity_matches + ], + "retrieved_relationships": [ + { + "source_name": match.source_name, + "target_name": match.target_name, + "description": match.description, + "keywords": match.keywords, + "score": match.score, + } + for match in relation_matches + ], + "retrieved_chunks": [ + { + "chunk_uuid": match.chunk_uuid, + "document_id": match.document_id, + "filename": match.filename, + "text": match.text, + "score": match.score, + } + for match in chunk_matches + ], + }, + ) + return result def retrieve( self, @@ -969,4 +1004,4 @@ def retrieve( "ChunkMatch", "ContextWindow", "RetrievalResult", -] \ No newline at end of file +] diff --git a/graph/project_paths.py b/graph/project_paths.py new file mode 100644 index 0000000..b051e0b --- /dev/null +++ b/graph/project_paths.py @@ -0,0 +1,100 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable, List, Optional, Union + +from settings import VALID_EXTENSIONS, StoragePaths, settings + + +@dataclass(frozen=True) +class ProjectPaths: + documents_root: Path + project_root: Path + storage_root: Path + knowledge_graph_dir: Path + graph_pickle_file: Path + retrieval_graph_pickle_file: Path + logs_dir: Path + qa_logs_dir: Path + audits_dir: Path + extraction_audits_dir: Path + storage: StoragePaths + ingestion_log_file: Path + pathrag_log_file: Path + lightrag_log_file: Path + + +def resolve_project_paths(documents_root: Union[Path, str]) -> ProjectPaths: + root = Path(documents_root).expanduser().resolve() + project_root = root / settings.project.artifacts_dirname + storage_root = project_root / settings.project.storage_dirname + knowledge_graph_dir = project_root / "knowledge_graph" + logs_dir = project_root / settings.project.logs_dirname + qa_logs_dir = logs_dir / settings.project.qa_logs_dirname + audits_dir = project_root / settings.project.audits_dirname + extraction_audits_dir = audits_dir / settings.project.extraction_audits_dirname + + storage 
= StoragePaths( + documents_db=str(storage_root / "documents.sqlite"), + chunks_db=str(storage_root / "chunks.sqlite"), + graph_db=str(storage_root / "graph.sqlite"), + chroma_chunks=str(storage_root / "chroma_chunks"), + chroma_entities=str(storage_root / "chroma_entities"), + chroma_relations=str(storage_root / "chroma_relations"), + ) + + return ProjectPaths( + documents_root=root, + project_root=project_root, + storage_root=storage_root, + knowledge_graph_dir=knowledge_graph_dir, + graph_pickle_file=knowledge_graph_dir / "kg.pkl", + retrieval_graph_pickle_file=knowledge_graph_dir / "kg_retrieval.pkl", + logs_dir=logs_dir, + qa_logs_dir=qa_logs_dir, + audits_dir=audits_dir, + extraction_audits_dir=extraction_audits_dir, + storage=storage, + ingestion_log_file=logs_dir / "ingestion.log", + pathrag_log_file=logs_dir / "pathrag.log", + lightrag_log_file=logs_dir / "lightrag.log", + ) + + +def ensure_project_dirs(project_paths: ProjectPaths) -> None: + for path in ( + project_paths.project_root, + project_paths.storage_root, + project_paths.knowledge_graph_dir, + project_paths.logs_dir, + project_paths.qa_logs_dir, + project_paths.audits_dir, + project_paths.extraction_audits_dir, + ): + path.mkdir(parents=True, exist_ok=True) + + +def list_document_paths( + documents_root: Union[Path, str], + *, + valid_extensions: Optional[Iterable[str]] = None, +) -> List[Path]: + root = Path(documents_root).expanduser().resolve() + if not root.exists() or not root.is_dir(): + return [] + + allowed = {ext.lower() for ext in (valid_extensions or VALID_EXTENSIONS)} + project_root = root / settings.project.artifacts_dirname + paths: List[Path] = [] + + for path in root.rglob("*"): + if not path.is_file(): + continue + if project_root in path.parents: + continue + if path.suffix.lower() not in allowed: + continue + paths.append(path) + + return sorted(paths, key=lambda item: str(item).lower()) diff --git a/graph/prompts.py b/graph/prompts.py index 1dc06d0..94c049b 100644 --- 
a/graph/prompts.py +++ b/graph/prompts.py @@ -211,6 +211,51 @@ Answer ONLY by `YES` OR `NO` if there are still entities that need to be added. """.strip() +PROMPTS["entity_extraction_audit"] = """---Role--- + +You are auditing an information extraction result for completeness. + +---Goal--- + +Review the source text and the initial extraction. Report possible missing entities or relationships, but do not rewrite or replace the original extraction. +Use {language} for any natural-language explanations. + +---Instructions--- + +- Focus only on entities of these types: [{entity_types}] +- Review the initial extraction carefully before reporting gaps. +- Only report candidates that appear to be missing from the original extraction. +- If nothing important appears missing, return empty arrays. +- Output valid JSON only. + +---Source Text--- +{input_text} + +---Initial Extraction--- +{initial_extraction} + +---Required JSON Schema--- +{{ + "missing_entities": [ + {{ + "name": "entity name", + "type": "entity type", + "reason": "why this looks missing" + }} + ], + "missing_relationships": [ + {{ + "source_name": "source entity", + "target_name": "target entity", + "reason": "why this relationship looks missing" + }} + ], + "summary": "short audit summary" +}} + +JSON: +""" + PROMPTS["fail_response"] = ( "Sorry, I'm not able to provide an answer to that question.[no-context]" ) @@ -383,6 +428,9 @@ Response:""" +PROMPTS["lightrag_response"] = PROMPTS["rag_response"] +PROMPTS["rag_response_naive"] = PROMPTS["naive_rag_response"] + PROMPTS["summarize_text"] = """---Role--- You are a helpful assistant responsible for generating a concise summary of the data provided below. Given a text document, please provide a concise summary that captures the main points and key information. 
@@ -426,4 +474,4 @@ hint_prompt = entity_extract_prompt.format( **{**context_base, "input_text": content} ) - print(hint_prompt) \ No newline at end of file + print(hint_prompt) diff --git a/graph/query_logging.py b/graph/query_logging.py new file mode 100644 index 0000000..d0975ad --- /dev/null +++ b/graph/query_logging.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import json +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Dict, Optional +from uuid import uuid4 + +from project_paths import ProjectPaths, ensure_project_dirs + + +def _json_safe(value: Any) -> Any: + if value is None or isinstance(value, (str, int, float, bool)): + return value + if isinstance(value, dict): + return {str(k): _json_safe(v) for k, v in value.items()} + if isinstance(value, (list, tuple, set)): + return [_json_safe(v) for v in value] + return str(value) + + +def write_query_log( + *, + project_paths: Optional[ProjectPaths], + retriever_name: str, + payload: Dict[str, Any], +) -> Optional[Path]: + if project_paths is None: + return None + + ensure_project_dirs(project_paths) + timestamp = datetime.now(timezone.utc) + filename = f"{timestamp.strftime('%Y%m%dT%H%M%S.%fZ')}_{retriever_name}_{uuid4().hex[:8]}.json" + target = project_paths.qa_logs_dir / filename + + body = { + "retriever": retriever_name, + "timestamp": timestamp.isoformat(), + **_json_safe(payload), + } + target.write_text(json.dumps(body, ensure_ascii=False, indent=2), encoding="utf-8") + return target diff --git a/graph/settings.py b/graph/settings.py index 4d8637a..aefb83c 100644 --- a/graph/settings.py +++ b/graph/settings.py @@ -1,330 +1,361 @@ -# graph/settings.py - -from __future__ import annotations - -from dataclasses import dataclass -from typing import List, Optional, Literal -from dotenv import load_dotenv -import utils as ut - - -VALID_EXTENSIONS: List[str] = [".pdf", ".docx", ".txt", ".md", ".html"] - -# 
───────────────────────────────────────────────────────────── -# Settings sections -# ───────────────────────────────────────────────────────────── - -@dataclass(frozen=True) -class ProviderSettings: - """ - Provider selection + credentials. - Choose provider via: LLM_PROVIDER = "openai" or "azure" - """ - provider: Literal["openai", "azure"] = "openai" - - # OpenAI (direct) - openai_api_key: Optional[str] = None - openai_base_url: Optional[str] = None # optional (for proxies / compatible servers) - openai_llm_model: Optional[str] = None # e.g. gpt-4o-mini - openai_embeddings_model: Optional[str] = None # e.g. text-embedding-3-small - - # Azure OpenAI - azure_api_key: Optional[str] = None - azure_endpoint: Optional[str] = None - azure_api_version: str = "2024-02-15-preview" - azure_llm_deployment: Optional[str] = None - azure_embeddings_deployment: Optional[str] = None - - -@dataclass(frozen=True) -class ChatGenerationSettings: - """ - Default knobs for chat completions used across the app. - """ - temperature: float = 0.0 - max_tokens: int = 2048 - -@dataclass(frozen=True) -class LLMPerformanceSettings: - """ - Performance-related knobs for LLM calls. - """ - max_concurrency: int = 6 # num of parallel requests - cache_enabled: bool = True - cache_max_age_hours: int = 720 # 30 days - -@dataclass(frozen=True) -class EmbeddingSettings: - """ - Embeddings model + batch behavior for vectorization. - (Model names come from ProviderSettings; this holds cross-cutting knobs.) - """ - batch_size: int = 64 - -@dataclass(frozen=True) -class PromptFormattingSettings: - """ - Formatting conventions your prompts expect & rely on. - Used by extractor prompts and downstream parsers. 
- """ - default_language: str = "English" - tuple_delimiter: str = "<|>" - record_delimiter: str = "##" - completion_delimiter: str = "<|COMPLETE|>" - default_entity_types: List[str] = None # filled in loader - -@dataclass(frozen=True) -class IngestionMergeSettings: - """ - How we concatenate multi-source fields before optional summarization. - delimiter: separates repeated descriptions/keywords/source_ids/filepaths - description_segment_limit: threshold after which we summarize with LLM - """ - delimiter: str = "||" - description_segment_limit: int = 5 - -@dataclass(frozen=True) -class ChunkingSettings: - """ - Default chunking policy for page-aware, sentence-preserving chunker. - """ - max_chars: int = 1200 - overlap_chars: int = 200 - include_overlap_in_limit: bool = True - join_with: str = " " - -@dataclass(frozen=True) -class StoragePaths: - """ - Where we store SQLite DBs and Chroma collections. - """ - documents_db: str = "./storage/documents.sqlite" - chunks_db: str = "./storage/chunks.sqlite" - graph_db: str = "./storage/graph.sqlite" - - chroma_chunks: str = "./storage/chroma_chunks" - chroma_entities: str = "./storage/chroma_entities" - chroma_relations: str = "./storage/chroma_relations" - -@dataclass(frozen=True) -class RetrievalSettings: - """ - Settings for retrieval operations (e.g. how many results to return). - """ - entity_top_k: int = 5 - relation_top_k: int = 5 - chunk_top_k: int = 6 - graph_depth: int = 2 - graph_windows: int = 3 - chunk_windows: int = 3 - graph_window_tokens: int = 512 - chunk_window_tokens: int = 512 - tiktoken_model: str = "gpt-4o-mini" # for token counting - llm_max_tokens: int = 512 - llm_temperature: float = 0.0 - history_turns: int = 4 - - # --- Hybrid/global (i.e. do we use paths & relations?) 
--- - hybrid_use_paths_for_global: bool = True # whether to build global context from paths - hybrid_use_relations_for_global: bool = False # whether to build global context from relations - global_max_windows: int = 4 # max global context windows - global_window_tokens: int = 512 # token cap per global window - - # --- Local toggles (i.e. do we use chunks and local neighborhoods?) --- - use_local_chunks: bool = True # whether to use local chunks - use_local_graph: bool = True # whether to use local graph - local_max_windows: int = 6 # cap total local windows - - # --- PathRAG-specific retrieval settings --- - path_use_top_entities: int = 5 # limit the number of entity seeds considered - path_max_depth: int = 3 # search up to 3 hops - path_threshold: float = 0.3 # propagation threshold - path_alpha: float = 0.8 # propagation decay - path_max_windows: int = 5 # how many path windows to emit - path_window_tokens: int = 512 # tokens per path window - - # --- LightRAG-specific retrieval settings --- - light_mode: str = "mix" # 'local', 'global', 'hybrid', 'mix', 'naive' - response_type: str = "Single Paragraph" #'Multiple Paragraphs', 'Single Paragraph' - rerank_top_k: int = 20 - enable_rerank: bool = True - rerank_cache_dir: str = "./flashrank_model" # directory for FlashRank model cache - rerank_model_name: str = "ms-marco-MultiBERT-L-12" # model name for reranking - truncate_chunks: bool = False # whether to truncate chunks by token limit - - -@dataclass(frozen=True) -class Settings: - """ - Full application settings bundle. 
- """ - provider: ProviderSettings - chat: ChatGenerationSettings - llmperf: LLMPerformanceSettings - embeddings: EmbeddingSettings - prompts: PromptFormattingSettings - ingestion: IngestionMergeSettings - chunking: ChunkingSettings - storage: StoragePaths - retrieval: RetrievalSettings - - - -# ───────────────────────────────────────────────────────────── -# Loader / validator -# ───────────────────────────────────────────────────────────── - -def load_settings() -> Settings: - """ - Loads and validates all application settings from environment variables. - - Reads from .env file, parses configuration for provider, LLM, embeddings, chunking, - storage paths, and retrieval settings. Validates required fields based on provider. - - Returns: - Settings: A fully configured Settings object. - - Raises: - RuntimeError: If required environment variables are missing or invalid. - """ - load_dotenv() # called once at startup - - # Provider selection - provider_name = (ut.env_str("LLM_PROVIDER", "openai") or "openai").strip().lower() - if provider_name not in {"openai", "azure"}: - raise RuntimeError("LLM_PROVIDER must be 'openai' or 'azure'.") - - provider = ProviderSettings( - provider=provider_name, # type: ignore[arg-type] - # OpenAI - openai_api_key=ut.env_str("OPENAI_API_KEY"), - openai_base_url=ut.env_str("OPENAI_BASE_URL"), - openai_llm_model=ut.env_str("OPENAI_LLM_MODEL"), - openai_embeddings_model=ut.env_str("OPENAI_EMBEDDINGS_MODEL"), - # Azure - azure_api_key=ut.env_str("AZURE_OPENAI_API_KEY"), - azure_endpoint=ut.env_str("AZURE_OPENAI_ENDPOINT"), - azure_api_version=ut.env_str("AZURE_OPENAI_API_VERSION", "2024-02-15-preview") or "2024-02-15-preview", - azure_llm_deployment=ut.env_str("AZURE_OPENAI_LLM_DEPLOYMENT_NAME"), - azure_embeddings_deployment=ut.env_str("AZURE_OPENAI_EMB_DEPLOYMENT_NAME"), - ) - - # Validate provider-specific required fields - if provider.provider == "openai": - if not provider.openai_api_key: - raise RuntimeError("OPENAI_API_KEY is 
required when LLM_PROVIDER=openai.") - if not provider.openai_llm_model: - raise RuntimeError("OPENAI_LLM_MODEL is required when LLM_PROVIDER=openai.") - if not provider.openai_embeddings_model: - raise RuntimeError("OPENAI_EMBEDDINGS_MODEL is required when LLM_PROVIDER=openai.") - else: - # azure - missing = [] - if not provider.azure_api_key: missing.append("AZURE_OPENAI_API_KEY") - if not provider.azure_endpoint: missing.append("AZURE_OPENAI_ENDPOINT") - if not provider.azure_llm_deployment: missing.append("AZURE_OPENAI_LLM_DEPLOYMENT_NAME") - if not provider.azure_embeddings_deployment: missing.append("AZURE_OPENAI_EMB_DEPLOYMENT_NAME") - if missing: - raise RuntimeError(f"When LLM_PROVIDER=azure, set required variables: {', '.join(missing)}") - - chat = ChatGenerationSettings( - temperature=ut.env_float("CHAT_TEMPERATURE", 0.0), - max_tokens=ut.env_int("CHAT_MAX_TOKENS", 2048), - ) - - llmperf = LLMPerformanceSettings( - max_concurrency=ut.env_int("LLM_MAX_CONCURRENCY", 6), - cache_enabled=ut.env_bool("LLM_CACHE_ENABLED", True), - cache_max_age_hours=ut.env_int("LLM_CACHE_MAX_AGE_HOURS", 720), - ) - - embeddings = EmbeddingSettings( - batch_size=ut.env_int("EMBEDDING_BATCH_SIZE", 64), - ) - - prompts = PromptFormattingSettings( - default_language=ut.env_str("PROMPT_DEFAULT_LANGUAGE", "English") or "English", - tuple_delimiter=ut.env_str("PROMPT_TUPLE_DELIMITER", "<|>") or "<|>", - record_delimiter=ut.env_str("PROMPT_RECORD_DELIMITER", "##") or "##", - completion_delimiter=ut.env_str("PROMPT_COMPLETION_DELIMITER", "<|COMPLETE|>") or "<|COMPLETE|>", - default_entity_types=ut.env_list( - "PROMPT_DEFAULT_ENTITY_TYPES", - "organization,person,geo,event,category" - ), - ) - - ingestion = IngestionMergeSettings( - delimiter=ut.env_str("MERGE_DELIMITER", "||") or "||", - description_segment_limit=ut.env_int("DESCRIPTION_SEGMENT_LIMIT", 5), - ) - - chunking = ChunkingSettings( - max_chars=ut.env_int("CHUNK_MAX_CHARS", 1200), - 
overlap_chars=ut.env_int("CHUNK_OVERLAP_CHARS", 200), - include_overlap_in_limit=ut.env_bool("CHUNK_INCLUDE_OVERLAP_IN_LIMIT", True), - join_with=ut.env_str("CHUNK_JOIN_WITH", " ") or " ", - ) - - storage = StoragePaths( - documents_db=ut.env_str("DOCUMENTS_DB_PATH", "./storage/documents.sqlite") or "./storage/documents.sqlite", - chunks_db=ut.env_str("CHUNKS_DB_PATH", "./storage/chunks.sqlite") or "./storage/chunks.sqlite", - graph_db=ut.env_str("GRAPH_DB_PATH", "./storage/graph.sqlite") or "./storage/graph.sqlite", - chroma_chunks=ut.env_str("CHROMA_CHUNKS_PATH", "./storage/chroma_chunks") or "./storage/chroma_chunks", - chroma_entities=ut.env_str("CHROMA_ENTITIES_PATH", "./storage/chroma_entities") or "./storage/chroma_entities", - chroma_relations=ut.env_str("CHROMA_RELATIONS_PATH", "./storage/chroma_relations") or "./storage/chroma_relations", - ) - - retrieval = RetrievalSettings( - entity_top_k=ut.env_int("RETRIEVAL_ENTITY_TOP_K", 5), - relation_top_k=ut.env_int("RETRIEVAL_RELATION_TOP_K", 5), - chunk_top_k=ut.env_int("RETRIEVAL_CHUNK_TOP_K", 6), - graph_depth=ut.env_int("RETRIEVAL_GRAPH_DEPTH", 2), - graph_windows=ut.env_int("RETRIEVAL_GRAPH_WINDOWS", 3), - chunk_windows=ut.env_int("RETRIEVAL_CHUNK_WINDOWS", 3), - graph_window_tokens=ut.env_int("RETRIEVAL_GRAPH_WINDOW_TOKENS", 512), - chunk_window_tokens=ut.env_int("RETRIEVAL_CHUNK_WINDOW_TOKENS", 512), - tiktoken_model = ut.env_str("RETRIEVAL_TIKTOKEN_MODEL", "gpt-4o-mini") or "gpt-4o-mini", - llm_max_tokens = ut.env_int("RETRIEVAL_LLM_MAX_TOKENS", 512), - llm_temperature = ut.env_float("RETRIEVAL_LLM_TEMPERATURE", 0.0), - history_turns= ut.env_int("RETRIEVAL_HISTORY_TURNS", 4), - # Hybrid/global - hybrid_use_paths_for_global = ut.env_bool("RETRIEVAL_HYBRID_USE_PATHS_FOR_GLOBAL", True), - hybrid_use_relations_for_global = ut.env_bool("RETRIEVAL_HYBRID_USE_RELATIONS_FOR_GLOBAL", False), - global_max_windows = ut.env_int("RETRIEVAL_GLOBAL_MAX_WINDOWS", 4), - global_window_tokens = 
ut.env_int("RETRIEVAL_GLOBAL_WINDOW_TOKENS", 512), - # Local - use_local_chunks = ut.env_bool("RETRIEVAL_USE_LOCAL_CHUNKS", True), - use_local_graph = ut.env_bool("RETRIEVAL_USE_LOCAL_GRAPH", True), - local_max_windows = ut.env_int("RETRIEVAL_LOCAL_MAX_WINDOWS", 6), - # PathRAG-specific - path_use_top_entities = ut.env_int("RETRIEVAL_PATH_USE_TOP_ENTITIES", 5), - path_max_depth = ut.env_int("RETRIEVAL_PATH_MAX_DEPTH", 3), - path_threshold = ut.env_float("RETRIEVAL_PATH_THRESHOLD", 0.3), - path_alpha = ut.env_float("RETRIEVAL_PATH_ALPHA", 0.8), - path_max_windows = ut.env_int("RETRIEVAL_PATH_MAX_WINDOWS", 5), - path_window_tokens = ut.env_int("RETRIEVAL_PATH_WINDOW_TOKENS", 512), - # LightRAG-specific - light_mode = ut.env_str("RETRIEVAL_LIGHT_MODE", "mix"), - response_type = ut.env_str("RETRIEVAL_RESPONSE_TYPE", "Single Paragraphs"), - enable_rerank = ut.env_bool("RETRIEVAL_ENABLE_RERANK", True), - rerank_top_k = ut.env_int("RETRIEVAL_RERANK_TOP_K", 20), - rerank_cache_dir = ut.env_str("RETRIEVAL_RERANK_CACHE_DIR", "./flashrank_model"), - rerank_model_name = ut.env_str("RETRIEVAL_RERANK_MODEL_NAME", "ms-marco-MultiBERT-L-12"), - truncate_chunks = ut.env_bool("RETRIEVAL_TRUNCATE_CHUNKS", False), - ) - - return Settings( - provider=provider, - chat=chat, - llmperf=llmperf, - embeddings=embeddings, - prompts=prompts, - ingestion=ingestion, - chunking=chunking, - storage=storage, - retrieval=retrieval, - ) - - -# Optional: convenience singleton (you can prefer dependency injection instead) -settings = load_settings() +from __future__ import annotations + +from dataclasses import dataclass +from typing import List, Literal, Optional + +from dotenv import load_dotenv + +import utils as ut + + +VALID_EXTENSIONS: List[str] = [".pdf", ".docx", ".txt", ".md", ".html"] + + +@dataclass(frozen=True) +class ProviderSettings: + provider: Literal["openai", "azure"] = "openai" + openai_api_key: Optional[str] = None + openai_base_url: Optional[str] = None + openai_llm_model: 
Optional[str] = None + openai_embeddings_model: Optional[str] = None + azure_api_key: Optional[str] = None + azure_endpoint: Optional[str] = None + azure_api_version: str = "2024-02-15-preview" + azure_llm_deployment: Optional[str] = None + azure_embeddings_deployment: Optional[str] = None + + +@dataclass(frozen=True) +class ChatGenerationSettings: + temperature: float = 0.0 + completion_max_tokens: int = 2048 + + @property + def max_tokens(self) -> int: + return self.completion_max_tokens + + +@dataclass(frozen=True) +class LLMPerformanceSettings: + max_concurrency: int = 6 + cache_enabled: bool = True + cache_max_age_hours: int = 720 + + +@dataclass(frozen=True) +class EmbeddingSettings: + batch_size: int = 64 + + +@dataclass(frozen=True) +class PromptFormattingSettings: + default_language: str = "English" + tuple_delimiter: str = "<|>" + record_delimiter: str = "##" + completion_delimiter: str = "<|COMPLETE|>" + default_entity_types: List[str] = None # type: ignore[assignment] + default_user_prompt: str = "n/a" + + +@dataclass(frozen=True) +class IngestionMergeSettings: + delimiter: str = "||" + description_segment_limit: int = 5 + + +@dataclass(frozen=True) +class ChunkingSettings: + max_chars: int = 1200 + overlap_chars: int = 200 + include_overlap_in_limit: bool = True + join_with: str = " " + + +@dataclass(frozen=True) +class StoragePaths: + documents_db: str = "./storage/documents.sqlite" + chunks_db: str = "./storage/chunks.sqlite" + graph_db: str = "./storage/graph.sqlite" + chroma_chunks: str = "./storage/chroma_chunks" + chroma_entities: str = "./storage/chroma_entities" + chroma_relations: str = "./storage/chroma_relations" + + +@dataclass(frozen=True) +class ProjectSettings: + artifacts_dirname: str = ".appl-kgraph" + storage_dirname: str = "storage" + logs_dirname: str = "logs" + qa_logs_dirname: str = "qa" + audits_dirname: str = "audits" + extraction_audits_dirname: str = "extraction" + + +@dataclass(frozen=True) +class ExtractionSettings: + 
use_chunk_language: bool = True + detect_chunk_language: bool = False + audit_second_pass_enabled: bool = False + + +@dataclass(frozen=True) +class LoggingSettings: + ingestion_enabled: bool = True + ingestion_level: str = "INFO" + retrieval_enabled: bool = True + retrieval_level: str = "INFO" + qa_enabled: bool = True + + +@dataclass(frozen=True) +class RetrievalSettings: + entity_top_k: int = 5 + relation_top_k: int = 5 + chunk_top_k: int = 6 + graph_depth: int = 2 + graph_windows: int = 3 + chunk_windows: int = 3 + graph_window_tokens: int = 512 + chunk_window_tokens: int = 512 + tiktoken_model: str = "gpt-4o-mini" + answer_max_tokens: int = 512 + llm_temperature: float = 0.0 + history_turns: int = 4 + hybrid_use_paths_for_global: bool = True + hybrid_use_relations_for_global: bool = False + global_max_windows: int = 4 + global_window_tokens: int = 512 + use_local_chunks: bool = True + use_local_graph: bool = True + local_max_windows: int = 6 + path_use_top_entities: int = 5 + path_max_depth: int = 3 + path_threshold: float = 0.3 + path_alpha: float = 0.8 + path_max_windows: int = 5 + path_window_tokens: int = 512 + light_mode: str = "mix" + response_type: str = "Single Paragraph" + rerank_top_k: int = 20 + enable_rerank: bool = True + rerank_cache_dir: str = "./flashrank_model" + rerank_model_name: str = "ms-marco-MultiBERT-L-12" + truncate_chunks: bool = False + + @property + def llm_max_tokens(self) -> int: + return self.answer_max_tokens + + +@dataclass(frozen=True) +class Settings: + provider: ProviderSettings + chat: ChatGenerationSettings + llmperf: LLMPerformanceSettings + embeddings: EmbeddingSettings + prompts: PromptFormattingSettings + ingestion: IngestionMergeSettings + chunking: ChunkingSettings + storage: StoragePaths + project: ProjectSettings + extraction: ExtractionSettings + logging: LoggingSettings + retrieval: RetrievalSettings + + +def load_settings() -> Settings: + load_dotenv() + + provider_name = (ut.env_str("LLM_PROVIDER", "openai") or 
"openai").strip().lower() + if provider_name not in {"openai", "azure"}: + raise RuntimeError("LLM_PROVIDER must be 'openai' or 'azure'.") + + provider = ProviderSettings( + provider=provider_name, # type: ignore[arg-type] + openai_api_key=ut.env_str("OPENAI_API_KEY"), + openai_base_url=ut.env_str("OPENAI_BASE_URL"), + openai_llm_model=ut.env_str("OPENAI_LLM_MODEL"), + openai_embeddings_model=ut.env_str("OPENAI_EMBEDDINGS_MODEL"), + azure_api_key=ut.env_str("AZURE_OPENAI_API_KEY"), + azure_endpoint=ut.env_str("AZURE_OPENAI_ENDPOINT"), + azure_api_version=ut.env_str("AZURE_OPENAI_API_VERSION", "2024-02-15-preview") + or "2024-02-15-preview", + azure_llm_deployment=ut.env_str("AZURE_OPENAI_LLM_DEPLOYMENT_NAME"), + azure_embeddings_deployment=ut.env_str("AZURE_OPENAI_EMB_DEPLOYMENT_NAME"), + ) + + if provider.provider == "openai": + if not provider.openai_api_key: + raise RuntimeError("OPENAI_API_KEY is required when LLM_PROVIDER=openai.") + if not provider.openai_llm_model: + raise RuntimeError("OPENAI_LLM_MODEL is required when LLM_PROVIDER=openai.") + if not provider.openai_embeddings_model: + raise RuntimeError("OPENAI_EMBEDDINGS_MODEL is required when LLM_PROVIDER=openai.") + else: + missing = [] + if not provider.azure_api_key: + missing.append("AZURE_OPENAI_API_KEY") + if not provider.azure_endpoint: + missing.append("AZURE_OPENAI_ENDPOINT") + if not provider.azure_llm_deployment: + missing.append("AZURE_OPENAI_LLM_DEPLOYMENT_NAME") + if not provider.azure_embeddings_deployment: + missing.append("AZURE_OPENAI_EMB_DEPLOYMENT_NAME") + if missing: + raise RuntimeError( + f"When LLM_PROVIDER=azure, set required variables: {', '.join(missing)}" + ) + + chat = ChatGenerationSettings( + temperature=ut.env_float("CHAT_TEMPERATURE", 0.0), + completion_max_tokens=ut.env_int("CHAT_MAX_TOKENS", 2048), + ) + + llmperf = LLMPerformanceSettings( + max_concurrency=ut.env_int("LLM_MAX_CONCURRENCY", 6), + cache_enabled=ut.env_bool("LLM_CACHE_ENABLED", True), + 
cache_max_age_hours=ut.env_int("LLM_CACHE_MAX_AGE_HOURS", 720), + ) + + embeddings = EmbeddingSettings( + batch_size=ut.env_int("EMBEDDING_BATCH_SIZE", 64), + ) + + prompts = PromptFormattingSettings( + default_language=ut.env_str("PROMPT_DEFAULT_LANGUAGE", "English") or "English", + tuple_delimiter=ut.env_str("PROMPT_TUPLE_DELIMITER", "<|>") or "<|>", + record_delimiter=ut.env_str("PROMPT_RECORD_DELIMITER", "##") or "##", + completion_delimiter=ut.env_str("PROMPT_COMPLETION_DELIMITER", "<|COMPLETE|>") + or "<|COMPLETE|>", + default_entity_types=ut.env_list( + "PROMPT_DEFAULT_ENTITY_TYPES", + "organization,person,geo,event,category", + ), + default_user_prompt=ut.env_str("PROMPT_DEFAULT_USER_PROMPT", "n/a") or "n/a", + ) + + ingestion = IngestionMergeSettings( + delimiter=ut.env_str("MERGE_DELIMITER", "||") or "||", + description_segment_limit=ut.env_int("DESCRIPTION_SEGMENT_LIMIT", 5), + ) + + chunking = ChunkingSettings( + max_chars=ut.env_int("CHUNK_MAX_CHARS", 1200), + overlap_chars=ut.env_int("CHUNK_OVERLAP_CHARS", 200), + include_overlap_in_limit=ut.env_bool("CHUNK_INCLUDE_OVERLAP_IN_LIMIT", True), + join_with=ut.env_str("CHUNK_JOIN_WITH", " ") or " ", + ) + + storage = StoragePaths( + documents_db=ut.env_str("DOCUMENTS_DB_PATH", "./storage/documents.sqlite") + or "./storage/documents.sqlite", + chunks_db=ut.env_str("CHUNKS_DB_PATH", "./storage/chunks.sqlite") + or "./storage/chunks.sqlite", + graph_db=ut.env_str("GRAPH_DB_PATH", "./storage/graph.sqlite") + or "./storage/graph.sqlite", + chroma_chunks=ut.env_str("CHROMA_CHUNKS_PATH", "./storage/chroma_chunks") + or "./storage/chroma_chunks", + chroma_entities=ut.env_str("CHROMA_ENTITIES_PATH", "./storage/chroma_entities") + or "./storage/chroma_entities", + chroma_relations=ut.env_str("CHROMA_RELATIONS_PATH", "./storage/chroma_relations") + or "./storage/chroma_relations", + ) + + project = ProjectSettings( + artifacts_dirname=ut.env_str("PROJECT_ARTIFACTS_DIRNAME", ".appl-kgraph") + or ".appl-kgraph", + 
storage_dirname=ut.env_str("PROJECT_STORAGE_DIRNAME", "storage") or "storage", + logs_dirname=ut.env_str("PROJECT_LOGS_DIRNAME", "logs") or "logs", + qa_logs_dirname=ut.env_str("PROJECT_QA_LOGS_DIRNAME", "qa") or "qa", + audits_dirname=ut.env_str("PROJECT_AUDITS_DIRNAME", "audits") or "audits", + extraction_audits_dirname=ut.env_str( + "PROJECT_EXTRACTION_AUDITS_DIRNAME", "extraction" + ) + or "extraction", + ) + + extraction = ExtractionSettings( + use_chunk_language=ut.env_bool("EXTRACTION_USE_CHUNK_LANGUAGE", True), + detect_chunk_language=ut.env_bool("EXTRACTION_DETECT_CHUNK_LANGUAGE", False), + audit_second_pass_enabled=ut.env_bool( + "EXTRACTION_AUDIT_SECOND_PASS_ENABLED", + False, + ), + ) + + logging_settings = LoggingSettings( + ingestion_enabled=ut.env_bool("INGESTION_LOG_ENABLED", True), + ingestion_level=ut.env_str("INGESTION_LOG_LEVEL", "INFO") or "INFO", + retrieval_enabled=ut.env_bool("RETRIEVAL_LOG_ENABLED", True), + retrieval_level=ut.env_str("RETRIEVAL_LOG_LEVEL", "INFO") or "INFO", + qa_enabled=ut.env_bool("QA_LOG_ENABLED", True), + ) + + retrieval = RetrievalSettings( + entity_top_k=ut.env_int("RETRIEVAL_ENTITY_TOP_K", 5), + relation_top_k=ut.env_int("RETRIEVAL_RELATION_TOP_K", 5), + chunk_top_k=ut.env_int("RETRIEVAL_CHUNK_TOP_K", 6), + graph_depth=ut.env_int("RETRIEVAL_GRAPH_DEPTH", 2), + graph_windows=ut.env_int("RETRIEVAL_GRAPH_WINDOWS", 3), + chunk_windows=ut.env_int("RETRIEVAL_CHUNK_WINDOWS", 3), + graph_window_tokens=ut.env_int("RETRIEVAL_GRAPH_WINDOW_TOKENS", 512), + chunk_window_tokens=ut.env_int("RETRIEVAL_CHUNK_WINDOW_TOKENS", 512), + tiktoken_model=ut.env_str("RETRIEVAL_TIKTOKEN_MODEL", "gpt-4o-mini") + or "gpt-4o-mini", + answer_max_tokens=ut.env_int("RETRIEVAL_LLM_MAX_TOKENS", 512), + llm_temperature=ut.env_float("RETRIEVAL_LLM_TEMPERATURE", 0.0), + history_turns=ut.env_int("RETRIEVAL_HISTORY_TURNS", 4), + hybrid_use_paths_for_global=ut.env_bool( + "RETRIEVAL_HYBRID_USE_PATHS_FOR_GLOBAL", + True, + ), + 
hybrid_use_relations_for_global=ut.env_bool( + "RETRIEVAL_HYBRID_USE_RELATIONS_FOR_GLOBAL", + False, + ), + global_max_windows=ut.env_int("RETRIEVAL_GLOBAL_MAX_WINDOWS", 4), + global_window_tokens=ut.env_int("RETRIEVAL_GLOBAL_WINDOW_TOKENS", 512), + use_local_chunks=ut.env_bool("RETRIEVAL_USE_LOCAL_CHUNKS", True), + use_local_graph=ut.env_bool("RETRIEVAL_USE_LOCAL_GRAPH", True), + local_max_windows=ut.env_int("RETRIEVAL_LOCAL_MAX_WINDOWS", 6), + path_use_top_entities=ut.env_int("RETRIEVAL_PATH_USE_TOP_ENTITIES", 5), + path_max_depth=ut.env_int("RETRIEVAL_PATH_MAX_DEPTH", 3), + path_threshold=ut.env_float("RETRIEVAL_PATH_THRESHOLD", 0.3), + path_alpha=ut.env_float("RETRIEVAL_PATH_ALPHA", 0.8), + path_max_windows=ut.env_int("RETRIEVAL_PATH_MAX_WINDOWS", 5), + path_window_tokens=ut.env_int("RETRIEVAL_PATH_WINDOW_TOKENS", 512), + light_mode=ut.env_str("RETRIEVAL_LIGHT_MODE", "mix") or "mix", + response_type=ut.env_str("RETRIEVAL_RESPONSE_TYPE", "Single Paragraph") + or "Single Paragraph", + enable_rerank=ut.env_bool("RETRIEVAL_ENABLE_RERANK", True), + rerank_top_k=ut.env_int("RETRIEVAL_RERANK_TOP_K", 20), + rerank_cache_dir=ut.env_str( + "RETRIEVAL_RERANK_CACHE_DIR", + "./flashrank_model", + ) + or "./flashrank_model", + rerank_model_name=ut.env_str( + "RETRIEVAL_RERANK_MODEL_NAME", + "ms-marco-MultiBERT-L-12", + ) + or "ms-marco-MultiBERT-L-12", + truncate_chunks=ut.env_bool("RETRIEVAL_TRUNCATE_CHUNKS", False), + ) + + return Settings( + provider=provider, + chat=chat, + llmperf=llmperf, + embeddings=embeddings, + prompts=prompts, + ingestion=ingestion, + chunking=chunking, + storage=storage, + project=project, + extraction=extraction, + logging=logging_settings, + retrieval=retrieval, + ) + + +settings = load_settings() diff --git a/graph/utils.py b/graph/utils.py index 0e519a1..5810969 100644 --- a/graph/utils.py +++ b/graph/utils.py @@ -1,80 +1,81 @@ +from __future__ import annotations + import os -from typing import List, Optional, Literal -from langdetect import 
detect, LangDetectException +from typing import List, Optional -def detect_language(text: str, num_chars: int = 1000) -> str: - """ - Detects the language of a text based on a sample of its characters. +from langdetect import LangDetectException, detect - Args: - text (str): The input text to analyze for language detection. - num_chars (int, optional): The number of characters from the beginning - of the text to use for detection. Defaults to 1000. - Returns: - str: A language code (e.g., 'en' for English, 'fr' for French) or 'unknown' - if the language cannot be detected or if the text is empty. +LANGUAGE_NAME_MAP = { + "ar": "Arabic", + "de": "German", + "en": "English", + "es": "Spanish", + "fr": "French", + "it": "Italian", + "nl": "Dutch", + "pt": "Portuguese", + "zh": "Chinese", + "zh-cn": "Chinese", + "zh-tw": "Chinese Traditional", +} + + +def detect_language(text: str, num_chars: int = 1000) -> str: + """ + Detect a language code from the leading portion of a text. """ text_snippet = text[:num_chars] if len(text) > num_chars else text if not text_snippet.strip(): - # Handle the case where the text snippet is empty or only contains whitespace - return 'unknown' + return "unknown" + try: return detect(text_snippet) - except LangDetectException as e: - if 'No features in text' in str(e): - # Handle the specific error where no features are found in the text - return 'unknown' - # Default return statement to ensure the function always returns a value - return 'unknown' + except LangDetectException as exc: + if "No features in text" in str(exc): + return "unknown" + return "unknown" -# ───────────────────────────────────────────────────────────── -# Small helpers to parse environment variables robustly -# ───────────────────────────────────────────────────────────── -def _strip_quotes(val: Optional[str]) -> Optional[str]: +def normalize_language_name(language: Optional[str], default: str = "English") -> str: + """ + Convert a language code or free-form language 
string into a prompt-friendly name. """ - Removes surrounding quotes from environment variable values. + if not language: + return default - Args: - val (Optional[str]): The value to process. + candidate = str(language).strip() + if not candidate: + return default - Returns: - Optional[str]: The value with quotes stripped, or None if input was None. - """ + lowered = candidate.lower() + if lowered == "unknown": + return default + if lowered in LANGUAGE_NAME_MAP: + return LANGUAGE_NAME_MAP[lowered] + if len(candidate) <= 3 and candidate.islower(): + return default + return candidate[:1].upper() + candidate[1:] + + +def _strip_quotes(val: Optional[str]) -> Optional[str]: if val is None: return None - v = val.strip() - if (v.startswith('"') and v.endswith('"')) or (v.startswith("'") and v.endswith("'")): - return v[1:-1] - return v - -def env_str(key: str, default: Optional[str] = None) -> Optional[str]: - """ - Reads a string environment variable with quote stripping. + stripped = val.strip() + if (stripped.startswith('"') and stripped.endswith('"')) or ( + stripped.startswith("'") and stripped.endswith("'") + ): + return stripped[1:-1] + return stripped - Args: - key (str): The environment variable name. - default (Optional[str], optional): Default value if not found. Defaults to None. - Returns: - Optional[str]: The environment variable value or default. - """ +def env_str(key: str, default: Optional[str] = None) -> Optional[str]: val = os.getenv(key) return _strip_quotes(val) if val is not None else default -def env_int(key: str, default: int) -> int: - """ - Reads an integer environment variable with fallback to default. - Args: - key (str): The environment variable name. - default (int): Default value if not found or invalid. - - Returns: - int: The parsed integer value or default. 
- """ +def env_int(key: str, default: int) -> int: val = env_str(key) if val is None or val == "": return default diff --git a/test/test_chunker.py b/test/test_chunker.py new file mode 100644 index 0000000..c4601fe --- /dev/null +++ b/test/test_chunker.py @@ -0,0 +1,43 @@ +import os +import sys +from pathlib import Path + + +os.environ.setdefault("OPENAI_API_KEY", "test-key") +os.environ.setdefault("OPENAI_LLM_MODEL", "test-model") +os.environ.setdefault("OPENAI_EMBEDDINGS_MODEL", "test-embed") + +sys.path.append(str(Path(__file__).resolve().parent.parent / "graph")) + +from graph.chunker import chunk_text + + +def test_chunk_overlap_does_not_cascade_previous_overlap(): + text = "A1. B2. C3. D4. E5. F6." + chunks = chunk_text( + text, + max_chars=11, + overlap_chars=7, + include_overlap_in_limit=True, + ) + + assert [chunk["text"] for chunk in chunks] == [ + "A1. B2. C3.", + "B2. C3. D4.", + "D4. E5. F6.", + ] + + +def test_chunker_keeps_oversized_sentence_intact(): + text = "This sentence is deliberately much longer than the configured chunk size." 
+ chunks = chunk_text( + text, + max_chars=10, + overlap_chars=0, + include_overlap_in_limit=True, + ) + + assert len(chunks) == 1 + assert chunks[0]["text"] == text + assert chunks[0]["char_count"] > 10 + assert chunks[0]["exceeds_target"] is True diff --git a/test/test_graph_pickle.py b/test/test_graph_pickle.py new file mode 100644 index 0000000..97b1901 --- /dev/null +++ b/test/test_graph_pickle.py @@ -0,0 +1,114 @@ +import os +import sys +from pathlib import Path + +import networkx as nx + + +os.environ.setdefault("OPENAI_API_KEY", "test-key") +os.environ.setdefault("OPENAI_LLM_MODEL", "test-model") +os.environ.setdefault("OPENAI_EMBEDDINGS_MODEL", "test-embed") + +sys.path.append(str(Path(__file__).resolve().parent.parent / "graph")) + +from graph.graph_pickle import ( + load_graph_from_pickle, + load_or_build_graph_snapshot, + save_graph_to_pickle, +) + + +class _FakeCursor: + def __init__(self, rows): + self._rows = rows + + def fetchall(self): + return self._rows + + +class _FakeConnection: + def __init__(self, node_rows, edge_rows): + self._node_rows = node_rows + self._edge_rows = edge_rows + + def execute(self, sql): + if "FROM nodes" in sql: + return _FakeCursor(self._node_rows) + if "FROM edges" in sql: + return _FakeCursor(self._edge_rows) + raise AssertionError(f"Unexpected query: {sql}") + + +class _FakeGraphDB: + def __init__(self, node_rows, edge_rows): + self._node_rows = node_rows + self._edge_rows = edge_rows + + def connect(self): + connection = _FakeConnection(self._node_rows, self._edge_rows) + + class _Context: + def __enter__(self_inner): + return connection + + def __exit__(self_inner, exc_type, exc, tb): + return False + + return _Context() + + +class _FakeStorage: + def __init__(self, node_rows, edge_rows): + self.graphdb = _FakeGraphDB(node_rows, edge_rows) + + +def test_graph_pickle_round_trip(tmp_path): + graph = nx.Graph() + graph.add_node("A", type="person") + graph.add_edge("A", "B", weight=2.0) + + target = tmp_path / 
"graph.pkl" + save_graph_to_pickle(graph, target) + loaded = load_graph_from_pickle(target) + + assert isinstance(loaded, nx.Graph) + assert sorted(loaded.nodes()) == ["A", "B"] + assert loaded["A"]["B"]["weight"] == 2.0 + + +def test_load_or_build_graph_snapshot_rebuilds_and_saves_when_missing(tmp_path): + storage = _FakeStorage( + node_rows=[ + ("Node A", "person", "Alpha", "chunk-1||chunk-2", "doc-a.txt"), + ("Node B", "organization", "Beta", "", "doc-b.txt"), + ], + edge_rows=[ + ("Node A", "Node B", 1.5, "works with", "partnership", "chunk-1", "doc-a.txt"), + ], + ) + + target = tmp_path / "kg_retrieval.pkl" + graph = load_or_build_graph_snapshot(storage, snapshot_path=target) + + assert target.exists() + assert sorted(graph.nodes()) == ["Node A", "Node B"] + assert graph.nodes["Node A"]["chunk_uuids"] == ["chunk-1", "chunk-2"] + assert graph["Node A"]["Node B"]["keywords"] == "partnership" + + +def test_load_or_build_graph_snapshot_prefers_existing_pickle(tmp_path): + graph = nx.Graph() + graph.add_node("Saved Node", type="event") + target = tmp_path / "kg_retrieval.pkl" + save_graph_to_pickle(graph, target) + + class _BrokenStorage: + class _GraphDB: + def connect(self): + raise AssertionError("Storage should not be consulted when snapshot exists") + + graphdb = _GraphDB() + + loaded = load_or_build_graph_snapshot(_BrokenStorage(), snapshot_path=target) + + assert sorted(loaded.nodes()) == ["Saved Node"] diff --git a/test/test_pathrag_storage_adapter.py b/test/test_pathrag_storage_adapter.py index 365b116..aae2697 100644 --- a/test/test_pathrag_storage_adapter.py +++ b/test/test_pathrag_storage_adapter.py @@ -1,41 +1,15 @@ +import os import sys -import types +from pathlib import Path import pytest -_storage_module = types.ModuleType("storage") +os.environ.setdefault("OPENAI_API_KEY", "test-key") +os.environ.setdefault("OPENAI_LLM_MODEL", "test-model") +os.environ.setdefault("OPENAI_EMBEDDINGS_MODEL", "test-embed") - -class _ImportStubStorage: - def 
__init__(self, *args, **kwargs): - pass - - def init(self): # pragma: no cover - simple stub - return None - - -class _ImportStubPaths: - pass - - -_storage_module.Storage = _ImportStubStorage -_storage_module.StoragePaths = _ImportStubPaths -sys.modules.setdefault("storage", _storage_module) - -_llm_module = types.ModuleType("llm") - - -class _ImportStubChat: - def __init__(self, *args, **kwargs): - pass - - def generate(self, *args, **kwargs): # pragma: no cover - simple stub - return "" - - -_llm_module.Chat = _ImportStubChat -sys.modules.setdefault("llm", _llm_module) +sys.path.append(str(Path(__file__).resolve().parent.parent / "graph")) from graph.pathrag import StorageAdapter @@ -54,7 +28,7 @@ def __init__(self, responses): self.relation_vectors = _FakeVector(responses["relations"]) self.chunk_vectors = _FakeVector(responses["chunks"]) - def init(self): # pragma: no cover - simple stub + def init(self): return None @@ -103,7 +77,7 @@ def fake_storage(monkeypatch): ], } storage = _FakeStorage(responses) - monkeypatch.setattr("graph.pathrag.IngestionStorage", lambda *args, **kwargs: storage) + monkeypatch.setattr("graph.pathrag.Storage", lambda *args, **kwargs: storage) return storage @@ -111,14 +85,14 @@ def test_storage_adapter_query_helpers_expand_matches(fake_storage): adapter = StorageAdapter() entity_matches = adapter.query_entities("query", limit=5) - assert [m.name for m in entity_matches] == ["Entity One", "Entity Two"] - assert all(m.score > 0 for m in entity_matches) + assert [match.name for match in entity_matches] == ["Entity One", "Entity Two"] + assert all(match.score > 0 for match in entity_matches) relation_matches = adapter.query_relations("query", limit=5) - assert [m.source_name for m in relation_matches] == ["Entity One", "Entity Two"] - assert all(m.score > 0 for m in relation_matches) + assert [match.source_name for match in relation_matches] == ["Entity One", "Entity Two"] + assert all(match.score > 0 for match in relation_matches) 
chunk_matches = adapter.query_chunks("query", limit=5) - assert [m.chunk_uuid for m in chunk_matches] == ["chunk-1", "chunk-2"] - assert [m.text for m in chunk_matches] == ["text one", "text two"] - assert all(m.score > 0 for m in chunk_matches) + assert [match.chunk_uuid for match in chunk_matches] == ["chunk-1", "chunk-2"] + assert [match.text for match in chunk_matches] == ["text one", "text two"] + assert all(match.score > 0 for match in chunk_matches) diff --git a/test/test_project_paths.py b/test/test_project_paths.py new file mode 100644 index 0000000..447830b --- /dev/null +++ b/test/test_project_paths.py @@ -0,0 +1,41 @@ +import os +import sys +from pathlib import Path + + +os.environ.setdefault("OPENAI_API_KEY", "test-key") +os.environ.setdefault("OPENAI_LLM_MODEL", "test-model") +os.environ.setdefault("OPENAI_EMBEDDINGS_MODEL", "test-embed") + +sys.path.append(str(Path(__file__).resolve().parent.parent / "graph")) + +from graph.project_paths import list_document_paths, resolve_project_paths + + +def test_resolve_project_paths_nests_artifacts_under_documents_root(tmp_path): + documents_root = tmp_path / "docs" + documents_root.mkdir() + + project_paths = resolve_project_paths(documents_root) + + assert project_paths.documents_root == documents_root.resolve() + assert project_paths.project_root == documents_root.resolve() / ".appl-kgraph" + assert Path(project_paths.storage.documents_db).parent == project_paths.storage_root + assert project_paths.qa_logs_dir.parent == project_paths.logs_dir + assert project_paths.graph_pickle_file == project_paths.knowledge_graph_dir / "kg.pkl" + assert project_paths.retrieval_graph_pickle_file == project_paths.knowledge_graph_dir / "kg_retrieval.pkl" + + +def test_list_document_paths_excludes_project_artifacts(tmp_path): + documents_root = tmp_path / "docs" + documents_root.mkdir() + (documents_root / "a.txt").write_text("alpha", encoding="utf-8") + (documents_root / "b.md").write_text("beta", encoding="utf-8") + + 
project_root = documents_root / ".appl-kgraph" + project_root.mkdir() + (project_root / "ignored.txt").write_text("ignore me", encoding="utf-8") + + paths = list_document_paths(documents_root) + + assert [path.name for path in paths] == ["a.txt", "b.md"]