From ba2aa9ee1f37a3bb237b7285c178974a185bca78 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 14 May 2025 08:52:00 -0500 Subject: [PATCH 1/4] Some updates to python readme --- python/README.md | 67 +++++++++++++++++++++++++++---------------- python/README.qmd | 72 ++++++++++++++++++++++++++--------------------- 2 files changed, 83 insertions(+), 56 deletions(-) diff --git a/python/README.md b/python/README.md index b51a59b..d1f66b7 100644 --- a/python/README.md +++ b/python/README.md @@ -4,21 +4,24 @@ +[![PyPi](https://img.shields.io/pypi/v/mlverse-mall.png)](https://pypi.org/project/mlverse-mall/) [![Python tests](https://github.com/mlverse/mall/actions/workflows/python-tests.yaml/badge.svg)](https://github.com/mlverse/mall/actions/workflows/python-tests.yaml) -[![Code +\| [![Package coverage](https://codecov.io/gh/mlverse/mall/branch/main/graph/badge.svg)](https://app.codecov.io/gh/mlverse/mall?branch=main) -[![Lifecycle: -experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) + -Run multiple LLM predictions against a data frame. The predictions are -processed row-wise over a specified column. It works using a -pre-determined one-shot prompt, along with the current row’s content. -`mall` has been implemented for both R and Python. The prompt that is -use will depend of the type of analysis needed. +Use Large Language Models (LLM) to run Natural Language Processing (NLP) +operations against your data. It takes advantage of the LLMs general +language training in order to get the predictions, thus removing the +need to train a new NLP model. `mall` is available for R and Python. -Currently, the included prompts perform the following: +It works by running multiple LLM predictions against your data. The +predictions are processed row-wise over a specified column. 
It relies on +the “one-shot” prompt technique to instruct the LLM on a particular NLP +operation to perform. The package includes prompts to perform the +following specific NLP operations: - [Sentiment analysis](#sentiment) - [Text summarizing](#summarize) @@ -26,18 +29,14 @@ Currently, the included prompts perform the following: - [Extract one, or several](#extract), specific pieces information from the text - [Translate text](#translate) -- [Verify that something it true](#verify) about the text (binary) -- [Custom prompt](#custom-prompt) +- [Verify that something is true](#verify) about the text (binary) -This package is inspired by the SQL AI functions now offered by vendors -such as -[Databricks](https://docs.databricks.com/en/large-language-models/ai-functions.html) -and Snowflake. `mall` uses [Ollama](https://ollama.com/) to interact -with LLMs installed locally. +For other NLP operations, `mall` offers the ability for you to [write +your own prompt](#custom-prompt). -For **Python**, `mall` is a library extension to -[Polars](https://pola.rs/). To interact with Ollama, it uses the -official [Python library](https://github.com/ollama/ollama-python). +`mall` is a library extension to [Polars](https://pola.rs/). To interact +with Ollama, it uses the official [Python +library](https://github.com/ollama/ollama-python). ``` python reviews.llm.sentiment("review") @@ -45,9 +44,13 @@ reviews.llm.sentiment("review") ## Motivation -We want to new find ways to help data scientists use LLMs in their daily -work. Unlike the familiar interfaces, such as chatting and code +We want to find new ways to help data scientists use LLMs in their +daily work. Unlike the familiar interfaces, such as chatting and code completion, this interface runs your text data directly against the LLM. +This package is inspired by the SQL AI functions now offered by vendors +such as +[Databricks](https://docs.databricks.com/en/large-language-models/ai-functions.html) +and Snowflake. 
The LLM’s flexibility, allows for it to adapt to the subject of your data, and provide surprisingly accurate predictions. This saves the data @@ -55,9 +58,20 @@ scientist the need to write and tune an NLP model. In recent times, the capabilities of LLMs that can run locally in your computer have increased dramatically. This means that these sort of -analysis can run in your machine with good accuracy. Additionally, it -makes it possible to take advantage of LLM’s at your institution, since -the data will not leave the corporate network. +analysis can run in your machine with good accuracy. It also makes it +possible to take advantage of LLMs at your institution, since the data +will not leave the corporate network. Additionally, LLM management and +integration platforms, such as [Ollama](https://ollama.com/), are now +very easy to set up and use. `mall` uses Ollama to interact with local +LLMs. + +The development version of `mall` lets you **use external LLMs such as +[OpenAI](https://openai.com/), [Gemini](https://gemini.google.com/) and +[Anthropic](https://www.anthropic.com/)**. In R, `mall` uses the +[`ellmer`](https://ellmer.tidyverse.org/index.html) package to integrate +with the external LLM, and the +[`chatlas`](https://posit-dev.github.io/chatlas/) package to integrate +in Python. ## Get started @@ -99,6 +113,11 @@ reviews = data.reviews reviews ``` + /Users/edgar/Projects/mall/python/.venv/lib/python3.12/site-packages/pydantic/_internal/_fields.py:132: UserWarning: Field "model_format" in ContentToolResult has conflict with protected namespace "model_". + + You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`. + warnings.warn( + | review | |----| | "This has been the best TV I've ever used. Great screen, and sound." 
| diff --git a/python/README.qmd b/python/README.qmd index 3bebf77..9e70728 100644 --- a/python/README.qmd +++ b/python/README.qmd @@ -7,56 +7,64 @@ execute: -[![Python tests](https://github.com/mlverse/mall/actions/workflows/python-tests.yaml/badge.svg)](https://github.com/mlverse/mall/actions/workflows/python-tests.yaml) -[![Code coverage](https://codecov.io/gh/mlverse/mall/branch/main/graph/badge.svg)](https://app.codecov.io/gh/mlverse/mall?branch=main) -[![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) - +[![PyPi](https://img.shields.io/pypi/v/mlverse-mall)](https://pypi.org/project/mlverse-mall/) [![Python tests](https://github.com/mlverse/mall/actions/workflows/python-tests.yaml/badge.svg)](https://github.com/mlverse/mall/actions/workflows/python-tests.yaml) \| [![Package coverage](https://codecov.io/gh/mlverse/mall/branch/main/graph/badge.svg)](https://app.codecov.io/gh/mlverse/mall?branch=main) + + -Run multiple LLM predictions against a data frame. The predictions are processed -row-wise over a specified column. It works using a pre-determined one-shot prompt, -along with the current row's content. `mall` has been implemented for both R -and Python. The prompt that is use will depend of the type of analysis needed. +Use Large Language Models (LLM) to run Natural Language Processing (NLP) +operations against your data. It takes advantage of the LLMs general language +training in order to get the predictions, thus removing the need to train a new +NLP model. `mall` is available for R and Python. -Currently, the included prompts perform the following: +It works by running multiple LLM predictions against your data. The predictions +are processed row-wise over a specified column. It relies on the "one-shot" +prompt technique to instruct the LLM on a particular NLP operation to perform. 
+The package includes prompts to perform the following specific NLP operations: -- [Sentiment analysis](#sentiment) -- [Text summarizing](#summarize) -- [Classify text](#classify) -- [Extract one, or several](#extract), specific pieces information from the text -- [Translate text](#translate) -- [Verify that something it true](#verify) about the text (binary) -- [Custom prompt](#custom-prompt) +- [Sentiment analysis](#sentiment) +- [Text summarizing](#summarize) +- [Classify text](#classify) +- [Extract one, or several](#extract), specific pieces information from the text +- [Translate text](#translate) +- [Verify that something is true](#verify) about the text (binary) -This package is inspired by the SQL AI functions now offered by vendors such as -[Databricks](https://docs.databricks.com/en/large-language-models/ai-functions.html) -and Snowflake. `mall` uses [Ollama](https://ollama.com/) to interact with LLMs -installed locally. +For other NLP operations, `mall` offers the ability for you to [write your own prompt](#custom-prompt). -For **Python**, `mall` is a library extension to [Polars](https://pola.rs/). To +`mall` is a library extension to [Polars](https://pola.rs/). To interact with Ollama, it uses the official [Python library](https://github.com/ollama/ollama-python). ```python reviews.llm.sentiment("review") ``` - ## Motivation -We want to new find ways to help data scientists use LLMs in their daily work. -Unlike the familiar interfaces, such as chatting and code completion, this interface -runs your text data directly against the LLM. +We want to find new ways to help data scientists use LLMs in their daily work. +Unlike the familiar interfaces, such as chatting and code completion, this +interface runs your text data directly against the LLM. This package is inspired +by the SQL AI functions now offered by vendors such as [Databricks](https://docs.databricks.com/en/large-language-models/ai-functions.html) +and Snowflake. 
-The LLM's flexibility, allows for it to adapt to the subject of your data, and -provide surprisingly accurate predictions. This saves the data scientist the -need to write and tune an NLP model. +The LLM's flexibility, allows for it to adapt to the subject of your data, and +provide surprisingly accurate predictions. This saves the data scientist the +need to write and tune an NLP model. In recent times, the capabilities of LLMs that can run locally in your computer -have increased dramatically. This means that these sort of analysis can run -in your machine with good accuracy. Additionally, it makes it possible to take -advantage of LLM's at your institution, since the data will not leave the -corporate network. +have increased dramatically. This means that these sort of analysis can run in +your machine with good accuracy. It also makes it possible to take +advantage of LLMs at your institution, since the data will not leave the +corporate network. Additionally, LLM management and integration platforms, such +as [Ollama](https://ollama.com/), are now very easy to set up and use. `mall` +uses Ollama to interact with local LLMs. + +The development version of `mall` lets you **use external LLMs such as +[OpenAI](https://openai.com/), [Gemini](https://gemini.google.com/) and +[Anthropic](https://www.anthropic.com/)**. In R, `mall` uses the +[`ellmer`](https://ellmer.tidyverse.org/index.html) +package to integrate with the external LLM, and the +[`chatlas`](https://posit-dev.github.io/chatlas/) package to integrate in Python. 
## Get started From 53ca8d76d84a6b312fff6b6fdff32c72e63e3391 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 14 May 2025 09:54:43 -0500 Subject: [PATCH 2/4] [python] Adds support for ollama Client --- python/mall/llm.py | 10 +++++++--- python/mall/polars.py | 14 ++++++++++---- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/python/mall/llm.py b/python/mall/llm.py index a0e01a9..de932a8 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -54,15 +54,19 @@ def llm_call(x, msg, use, valid_resps="", convert=None, data_type=None): hash_call = build_hash(call) cache = cache_check(hash_call, use) - if cache == "": if backend == "chatlas": chat = use.get("chat") ch = chat.chat(msg[0].get("content") + x, echo="none") out = ch.get_content() chat.set_turns(list()) - if backend == "ollama": - resp = ollama.chat( + if backend == "ollama" or backend == "ollama-client": + if backend == "ollama": + chat_fun = ollama.chat + else: + client = use.get("client") + chat_fun = client.chat + resp = chat_fun( model=use.get("model"), messages=build_msg(x, msg), options=use.get("options"), diff --git a/python/mall/polars.py b/python/mall/polars.py index bb64739..6202cc9 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -1,3 +1,4 @@ +from ollama import Client from chatlas import Chat import polars as pl @@ -45,9 +46,10 @@ def use(self, backend="", model="", _cache="_mall_cache", **kwargs): Parameters ------ - backend : str | Chat - The name of the backend to use, or a `chatlas` chat object. - At the beginning of the session it defaults to "ollama". + backend : str | Chat | Client + The name of the backend to use, or an Ollama Client object, + or a `chatlas` Chat object. + At the beginning of the session it defaults to "ollama". If passing `""`, it will remain unchanged model : str The name of the model tha the backend should use. 
At the beginning @@ -87,7 +89,7 @@ def use(self, backend="", model="", _cache="_mall_cache", **kwargs): ``` ```{python} - # Use a `chatlas` object + # Use a `chatlas` object from chatlas import ChatOpenAI chat = ChatOpenAI() reviews.llm.use(chat) @@ -98,6 +100,10 @@ def use(self, backend="", model="", _cache="_mall_cache", **kwargs): self._use.update(dict(chat=backend)) backend = "" model = "" + if isinstance(backend, Client): + self._use.update(dict(backend="ollama-client")) + self._use.update(dict(client=backend)) + backend = "" if backend != "": self._use.update(dict(backend=backend)) if model != "": From c7d754ccf573a29ff2b2e21e5b30e73571a445b1 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 14 May 2025 09:57:12 -0500 Subject: [PATCH 3/4] [py] Version bump --- python/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index a1ad3c1..5807592 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -3,7 +3,7 @@ packages = ["mall"] [project] name = "mlverse-mall" -version = "0.1.0.9000" +version = "0.1.0.9001" description = "Run multiple 'Large Language Model' predictions against a table. The predictions run row-wise over a specified column." 
readme = "README.md" authors = [ From b7cb43b90900dcf78b6f620023c0dcfeddab6ebe Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 28 May 2025 15:18:46 -0500 Subject: [PATCH 4/4] Updates test snapshots --- r/man/llm_use.Rd | 1 - r/tests/testthat/_snaps/llm-use.md | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/r/man/llm_use.Rd b/r/man/llm_use.Rd index 2ae25fb..fd84f80 100644 --- a/r/man/llm_use.Rd +++ b/r/man/llm_use.Rd @@ -69,6 +69,5 @@ llm_use(.silent = TRUE) library(ellmer) chat <- chat_openai(model = "gpt-4o") llm_use(chat) - } } diff --git a/r/tests/testthat/_snaps/llm-use.md b/r/tests/testthat/_snaps/llm-use.md index b18e5ec..c1c0581 100644 --- a/r/tests/testthat/_snaps/llm-use.md +++ b/r/tests/testthat/_snaps/llm-use.md @@ -26,7 +26,7 @@ -- mall session object Backend: ellmer - LLM session: model:gpt-4o + LLM session: model:gpt-4.1 # Ensures empty llm_use works with Chat @@ -36,5 +36,5 @@ -- mall session object Backend: ellmer - LLM session: model:gpt-4o + LLM session: model:gpt-4.1