sologuy · sologuy · Dec 1, 2025 · Nov 11, 2025 · Nov 11, 2025 · Nov 11, 2025
diff --git a/.github/workflows/build-exe.yml b/.github/workflows/build-exe.yml
@@ -0,0 +1,45 @@
+# Runs build_app.py on Linux, Windows and macOS runners and uploads each
+# runner's dist/ directory as a per-OS workflow artifact.
+name: Build Executable App Across OSes
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      # Keep the other OS builds running when one of them fails, as the
+      # other workflows of this repository do.
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-latest]
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v5  # same major version as the other workflows
+      with:
+        python-version: "*"  # "*" = last stable python version
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+        pip install pyinstaller
+
+    - name: Build executable
+      run: python build_app.py
+
+    - name: Upload artifacts
+      uses: actions/upload-artifact@v4
+      with:
+        # The OS suffix keeps artifact names unique across the matrix jobs
+        name: BookmarkSummarizer-Binaries-${{ matrix.os }}
+        path: dist/  # presumably where build_app.py writes its output -- confirm
diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml
@@ -0,0 +1,80 @@
+# This workflow will install Python dependencies and run tests with a variety of Python versions
+# It uses the Python Package GitHub Actions workflow.
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+# and https://www.youtube.com/watch?v=l6fV09z5XHk
+
+name: Continuous integration for each commit and pull request
+
+on:
+  push:
+    branches:
+      - main # $default-branch only works in Workflows templates, not in Workflows, see https://stackoverflow.com/questions/64781462/github-actions-default-branch-variable
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["*"]  # check the list of versions: https://github.com/actions/python-versions/releases and https://github.com/actions/setup-python/blob/main/docs/advanced-usage.md -- note that "*" represents the latest stable version of Python
+        os: [ ubuntu-latest, windows-latest, macos-latest ] # jobs that run on Windows and macOS runners that GitHub hosts consume minutes at 2 and 10 times the rate that jobs on Linux runners consume respectively. But it's free for public OSS repositories.
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+        cache: 'pip'
+    # You can test your matrix by printing the current Python version
+    - name: Display Python version
+      run: |
+        python -c "import sys; print(sys.version)"
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+    - name: Install this Python app
+      run: |
+        python -m pip install --upgrade --editable .[test] --verbose --use-pep517
+    - name: Test with pytest
+      run: |
+        #coverage run --branch -m pytest . -v  # Do NOT do that, because coverage is already run in pytest as specified in pyproject.toml, so this calls two nested instances of coverage, hence this will glitch out!
+        pytest -v  # run tests with coverage (as specified in pyproject.toml) and save the coverage as html and xml
+        coverage report -m
+    - name: Upload coverage to Codecov
+      uses: codecov/codecov-action@v5
+      # `env_vars` below tags the upload with the named environment variables, so give them values here (scoped to this step only)
+      env:
+        OS: ${{ matrix.os }}
+        PYTHON: ${{ matrix.python-version }}
+      with:
+        token: ${{ secrets.CODECOV_TOKEN }}  # now required even for public repos, and also advised to avoid rate-limiting of the API by GitHub, which makes the upload fail randomly: https://community.codecov.com/t/upload-issues-unable-to-locate-build-via-github-actions-api/3954/9 and https://github.com/codecov/codecov-action/issues/598
+        #directory: ./coverage/reports/
+        env_vars: OS,PYTHON
+        fail_ci_if_error: false
+        #files: ./coverage1.xml,./coverage2.xml
+        flags: unittests
+        name: codecov-umbrella
+        verbose: true
+    - name: Build sdist (necessary for the other tests below)
+      if: ${{ matrix.python-version == '*' }}
+      run: |
+        pip install --upgrade build
+        python -sBm build
+    - name: Twine check
+      if: ${{ matrix.python-version == '*' }}
+      run: |
+        pip install --upgrade twine
+        twine check "dist/*"
+    - name: pyproject.toml validity
+      if: ${{ matrix.python-version == '*' }}
+      run: |
+        pip install --upgrade validate-pyproject
+        validate-pyproject pyproject.toml -v
+    - name: Check for potential security issues
+      run: |
+        pip install --upgrade bandit
+        bandit -r . -x ./tests -lll
diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml
diff --git a/.github/workflows/releases-ci-cd.yml b/.github/workflows/releases-ci-cd.yml
@@ -0,0 +1,123 @@
+# This workflow will test the module and then upload to PyPi, when triggered by the creation of a new GitHub Release
+# It uses the Python Package GitHub Actions workflow.
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+# and https://www.youtube.com/watch?v=l6fV09z5XHk
+# and https://py-pkgs.org/08-ci-cd#uploading-to-testpypi-and-pypi
+
+name: Releases test, coverage and upload to Test PyPi and PyPi
+
+# Build only when a new release is published, or when the workflow is started manually
+on:
+  # push:  # build on every commit push
+  # pull_request:  # build on every pull request
+  release:  # build on every release
+    types:
+    - published  # use published, not released and prereleased, because prereleased is not triggered if created from a draft: https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#release
+  workflow_dispatch:  # allows starting this workflow manually
+
+jobs:
+  testbuild:
+    name: Unit test and building
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ['*']  # '*' stands for the latest stable version of Python -- available versions: https://github.com/actions/python-versions/releases -- version syntax: https://github.com/actions/setup-python/blob/main/docs/advanced-usage.md
+        os: [ubuntu-latest, windows-latest, macos-latest]  # GitHub-hosted Windows and macOS runners consume minutes at 2 and 10 times the Linux rate respectively, but they are free for public OSS repositories.
+    runs-on: ${{ matrix.os }}
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        cache: pip
+        python-version: ${{ matrix.python-version }}
+    # Print the interpreter version so the resolved matrix entry can be checked in the logs
+    - name: Display Python version
+      run: |
+        python -c "import sys; print(sys.version)"
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        #python -m pip install pytest pytest-cov  # done in setup.cfg for Py2 or pyproject.toml for Py3
+        #if [ ${{ matrix.python-version }} <= 3.7 ]; then python -m pip install 'coverage<4'; else python -m pip install coverage; fi
+        #if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Install this module
+      # Disabled condition, it does not work on dynamic versions (see https://github.com/actions/setup-python/issues/644): if: ${{ matrix.python-version >= 3 }}
+      # Do not import testmeta: it makes the build fail somehow, because some dependencies are unavailable on Py2
+      run: |
+        #python -m pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple --upgrade --editable .[test] --verbose --use-pep517
+        # Here we do NOT build against test.pypi.org but only the real pypi because we want to test before shipping whether users with a normal pypi version can install our package!
+        python -m pip install --upgrade --editable .[test] --verbose --use-pep517
+    - name: Test with pytest
+      run: |
+        #coverage run --branch -m pytest . -v  # Do NOT do that, because coverage is already run in pytest as specified in pyproject.toml, so this calls two nested instances of coverage, hence this will glitch out!
+        pytest -v
+        coverage report -m
+    - name: Build source distribution and wheel
+      run: |
+        python -m pip install --upgrade build
+        python -sBm build
+    - name: Save dist/ content for reuse in other GitHub Workflow blocks
+      uses: actions/upload-artifact@v4
+      if: ${{ matrix.os == 'ubuntu-latest' }}
+      with:
+        path: dist/*
+
+  upload_test_pypi:  # Upload to TestPyPi first to ensure that the release is OK (we will try to download it and install it afterwards), as recommended in https://py-pkgs.org/08-ci-cd#uploading-to-testpypi-and-pypi
+    name: Upload to TestPyPi
+    needs: [testbuild]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4  # the sources are needed by the `pip install .` in the last step
+      - name: Unpack default artifact into dist/
+        uses: actions/download-artifact@v4
+        with:
+          # unpacks default artifact into dist/
+          # if `name: artifact` is omitted, the action will create extra parent dir
+          name: artifact
+          path: dist
+
+      - name: Upload to TestPyPi
+        uses: pypa/gh-action-pypi-publish@release/v1  # rolling stable ref recommended by the action's README; pin a tag or commit SHA for stricter reproducibility
+        with:
+          user: __token__
+          password: ${{ secrets.TEST_PYPI_API_TOKEN }}
+          repository-url: https://test.pypi.org/legacy/
+          # `repository-url` and TEST_PYPI_API_TOKEN make this step target TestPyPi; the upload_pypi job omits `repository-url` and uses PYPI_API_TOKEN to publish to the real PyPi -- for more infos on registering and using TestPyPi, read: https://py-pkgs.org/08-ci-cd#uploading-to-testpypi-and-pypi
+
+      - name: Test install from TestPyPI
+        run: |
+          python -m pip install --upgrade pip
+          # First install dependencies from the real PyPI by installing the local package
+          # This avoids dependency confusion attacks (e.g. FASTAPI 1.0 on TestPyPI)
+          python -m pip install .
+          # Then uninstall the local package but keep dependencies
+          python -m pip uninstall bookmark-summarizer -y
+          # Finally install the package from TestPyPI without dependencies (since they are already installed)
+          python -m pip install \
+          --index-url https://test.pypi.org/simple/ \
+          --no-deps \
+          bookmark-summarizer
+
+  upload_pypi:  # Upload to the real PyPi if everything else worked before, as suggested in: https://py-pkgs.org/08-ci-cd#uploading-to-testpypi-and-pypi
+    name: Upload to the real PyPi
+    needs: [testbuild, upload_test_pypi]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          # unpacks default artifact into dist/
+          # if `name: artifact` is omitted, the action will create extra parent dir
+          name: artifact
+          path: dist
+
+      - uses: pypa/gh-action-pypi-publish@release/v1  # rolling stable ref recommended by the action's README; no repository URL is given, so the action's default index is used
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI_API_TOKEN }}
+
+      - name: Test install from PyPI
+        run: |
+          python -m pip install --upgrade pip
+          pip uninstall bookmark-summarizer -y
+          pip install --upgrade bookmark-summarizer
diff --git a/.gitignore b/.gitignore
@@ -181,5 +181,38 @@ failed_urls*.json
 failed_urls.txt
 .env
 
+# Configuration files (contain API keys, must be ignored)
+*.toml
+!default_config.toml
+!pyproject.toml
+
+# Database and index files
+*.lmdb
+*.lmdb/
+bookmark_index.lmdb/
+whoosh_index/
+
+# Backup directories
+backups/
+backup/
+*.backup
+
+# Temporary and debug files
+check_index.py
+debug_*.py
+test_*.py
+!tests/test_*.py
+
+# Log files
+crawl_errors.log
+*.log
+
+# IDEs and editors
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
 # 不忽略的文件
 !requirements.txt
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1,28 @@
+Apply these instructions in any language; translate into the appropriate language before responding.
+
+Before answering, consider what senior expert knowledge would best fit, then adopt the persona of the most relevant human expert for the question, and explicitly mention which expert you chose. For example, for relationship issues, become a couples therapist. You can combine personas if both are highly relevant.
+
+When you are asked to solve a problem but there is no straightforward solution, offer to be creative to find multiple innovative solutions.
+
+Be extremely detailed and comprehensive. Err on the side of including too much information rather than too little, unless the user has requested brevity. Provide background, logic, alternatives, implications, and expert context in your answers.
+
+Be honest, transparent, and thorough. Assume the user needs highly reliable, decision-critical information, so take the time to check for gaps, biases, or false assumptions.
+
+When the user asks for a solution, be innovative but pragmatic and mindful of minimizing algorithmic complexity, and you can suggest multiple alternatives if there is no obviously optimal solution that is well established for this type of problem.
+
+Always check whether it is impossible to achieve what the user wants to do. In this case, clearly state so, then adopt a creative persona, and offer multiple alternative solutions for the underlying problem, then ask the user which solution they would prefer.
+
+Always try to keep changes to the bare minimum. Avoid any unnecessary changes, except if they improve readability or functionality. For example, if changing a function's name would improve neither readability nor functionality, just keep it as it is.
+
+To achieve minimization, always think about multiple different ways to reach your objective, as there are not only different conceptual ways, but also once a conceptual way is chosen, there are multiple implementations possible to achieve the same purpose. Always try to choose the implementation that would lead to the least changes in the codebase, unless the user states this approach was already tried and failed.
+
+The user likes literate programming, hence add as many pertinent and non-trivial comments as possible to your changes.
+
+In case of bugs:
+* feel free to experiment with the API directly yourself via command-line to check if it works as you expect,
+* and always check whether the variables used indeed exist and contain the values they are supposed to at run-time.
+
+Try to be innovative, and to think in a first principles way. Suggest several options when brainstorming solutions or when the solution to a problem is not obvious.
+
+When orchestrating a new plan of action, first investigate the cause of the stated problem and how to best fix it by reading the source files and potentially by running a few CLI commands (no more than 3), make a detailed plan with one or several solutions offered, and ask the user to validate it before doing any edit.
+
diff --git a/BUILD.md b/BUILD.md
@@ -0,0 +1,3 @@
+# BUILD INSTRUCTIONS
+
+To build native executables locally (for your own OS, e.g., Windows), run `python build_app.py`. For cross-platform builds, push to GitHub to trigger the Actions workflow.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-# 更新日志
+# Changelog
 
 所有对 BookmarkSummarizer 项目的显著更改都将记录在此文件中。
 
@@ -7,6 +7,29 @@
 
 ## [未发布]
 
+### 0.4.1
+
+Refactored crawl.py for parallel processing.
+
+There was an intentionally sequential path that was triggered when a --limit was set, which was the primary cause of the non-parallel behavior. It was replaced with a single, unified parallel implementation that now correctly handles both limited and unlimited crawls.
+
+*   **Parallel Bookmark Processing:** The processing logic now resides in the `_crawl_bookmark` worker function, which is called for every bookmark within the `ThreadPoolExecutor`. This ensures all bookmarks are processed concurrently.
+*   **Partial Flushing:** The periodic flushing is handled within the main `for future in as_completed(futures):` loop. It checks the time elapsed since the last flush and writes the latest batch of results to disk, preserving the exact same data-saving functionality as before.
+
+### 0.3.1
+
+Big bundle of updates, with various new features and bugfixes:
+* Translates the whole project from Chinese to English, including the summarization prompt, but language autodetection was added so that the summary is in the webpage's content language.
+* Add support for other browsers, and in addition, bookmarks are by default imported from all installed browsers (hence we import from multiple browsers at once). A single browser can still be specified using an argument.
+* Add a very fast fuzzy search engine with a GUI web app with pagination support. It is blazingly fast and scalable both for the indexing and lookup, it is intended to scale to millions of bookmarks, everything is stored on-disk so RAM is not an issue.
+* Indexing resuming and deduplication (also implemented for summarization) and atomic intermediate flushing, so we can do incremental updates of the database or interrupt and continue. This is especially important for those with a LOT of bookmarks (like me! Because I use bookmarks as a past browsing sessions saver/dump).
+* Pythonic packaging pyproject.toml, so this app can be published on pypi and easily installed through pip install.
+* CLI entrypoints are created on pip install for the main scripts: index.py, crawl.py and fuzzy_bookmark_search.py.
+* An LMDB database for the content crawling and the summaries, and a Whoosh database for fast fuzzy searching. Both databases scale dynamically along with the number of bookmarks (the crawling database is doubled in size each time the bookmarks' content gets too close to the database's total size). The LMDB is out-of-core, so it is extremely scalable as it can grow in size much beyond the current RAM available on the user's system, and only a fraction of RAM is necessary to create a view to access the LMDB, so the RAM footprint remains very minimal (a few dozen to a few hundred MB) even when the database is dozens of GB (and a few GB of RAM to access a multi-TB database).
+* Changed the default settings for the summaries to use ollama and qwen3:1.7b, it is very effective. Alternatively, qwen3:0.6b produces acceptable summaries too albeit less accurate and with a shorter context window.
+* Modular architecture: custom parsers can be added without modifying the core logic by adding python files in custom_parsers. For example, custom parsers are provided to extract YouTube transcripts as content to summarize, and suspended tabs that got bookmarked are transparently unsuspended to fetch the true target page content.
+* A lot of bugfixes here and there, and additional verbose outputs.
+
 ### 新增
 - 初始版本开发
 - 支持从 Chrome 书签提取 URL
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# BUILD INSTRUCTIONS

		To build native executables locally (for your own OS, eg, Windows): Run python build_app.py. For cross-platform builds, push to GitHub to trigger the Actions workflow.