Commit 59c4119: Merge with main

hauselin committed Aug 26, 2024
1 parent d06c558
Showing 3 changed files with 291 additions and 0 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/draft-pdf.yml
@@ -0,0 +1,24 @@
name: Draft PDF
on: [push]

jobs:
  paper:
    runs-on: ubuntu-latest
    name: Paper Draft
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Build draft PDF
        uses: openjournals/openjournals-draft-action@master
        with:
          journal: joss
          # This should be the path to the paper within your repo.
          paper-path: paper/paper.md
      - name: Upload
        uses: actions/upload-artifact@v4
        with:
          name: paper
          # This is the output path where Pandoc will write the compiled
          # PDF. Note, this should be the same directory as the input
          # paper.md
          path: paper/paper.pdf
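
For context: the openjournals draft action compiles paper/paper.md with Pandoc, and JOSS expects that file to open with YAML front matter naming the bibliography added below. A minimal sketch of what that front matter typically looks like (every value here is an illustrative placeholder, not taken from this commit):

---
title: 'Example Package: a one-line description'  # placeholder title
tags:
  - R
authors:
  - name: Jane Doe                                 # placeholder author
    orcid: 0000-0000-0000-0000
    affiliation: 1
affiliations:
  - name: Example University                       # placeholder affiliation
    index: 1
date: 26 August 2024
bibliography: paper.bib
---

With this in place, the action writes the compiled PDF to paper/paper.pdf, which the Upload step above publishes as a workflow artifact named paper.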
103 changes: 103 additions & 0 deletions paper/paper.bib
@@ -0,0 +1,103 @@
@article{Turner2024Aug,
author = {Turner, Stephen D.},
title = {{biorecap: an R package for summarizing bioRxiv preprints with a local LLM}},
journal = {arXiv},
year = {2024},
month = aug,
urldate = {2024-08-24},
eprint = {2408.11707},
doi = {10.48550/arXiv.2408.11707},
keywords = {Other Quantitative Biology (q-bio.OT)},
abstract = {{The establishment of bioRxiv facilitated the rapid adoption of preprints in the life sciences, accelerating the dissemination of new research findings. However, the sheer volume of preprints published daily can be overwhelming, making it challenging for researchers to stay updated on the latest developments. Here, I introduce biorecap, an R package that retrieves and summarizes bioRxiv preprints using a large language model (LLM) running locally on nearly any commodity laptop. biorecap leverages the ollamar package to interface with the Ollama server and API endpoints, allowing users to prompt any local LLM available through Ollama. The package follows tidyverse conventions, enabling users to pipe the output of one function as input to another. Additionally, biorecap provides a single wrapper function that generates a timestamped CSV file and HTML report containing short summaries of recent preprints published in user-configurable subject areas. By combining the strengths of LLMs with the flexibility and security of local execution, biorecap represents an advancement in the tools available for managing the information overload in modern scientific research. The biorecap R package is available on GitHub at this https URL under an open-source (MIT) license.}}
}


@article{Hill2024May,
author = {Hill, Chelsey and Du, Lanqing and Johnson, Marina and McCullough, B. D.},
title = {{Comparing programming languages for data analytics: Accuracy of estimation in Python and R}},
journal = {Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery},
volume = {14},
number = {3},
pages = {e1531},
year = {2024},
month = may,
urldate = {2024-08-24},
issn = {1942-4787},
publisher = {John Wiley \& Sons, Ltd},
doi = {10.1002/widm.1531},
keywords = {comparing Python and R, open-source software for data analytics, statistical software reliability and accuracy},
abstract = {{Several open-source programming languages, particularly R and Python, are utilized in industry and academia for statistical data analysis, data mining, and machine learning. While most commercial software programs and programming languages provide a single way to deliver a statistical procedure, open-source programming languages have multiple libraries and packages offering many ways to complete the same analysis, often with varying results. Applying the same statistical method across these different libraries and packages can lead to entirely different solutions due to the differences in their implementations. Therefore, reliability and accuracy should be essential considerations when making library and package usage decisions while conducting statistical analysis using open source programming languages. Instead, most users take this for granted, assuming that their chosen libraries and packages produce accurate results for their statistical analysis. To this extent, this study assesses the estimation accuracy and reliability of Python and R's various libraries and packages by evaluating the univariate summary statistics, analysis of variance (ANOVA), and linear regression procedures using benchmarking data from the National Institutes of Standards and Technology (NIST). Further, experimental results are presented comparing machine learning methods for classification and regression. The libraries and packages assessed in this study include the stats package in R and Pandas, Statistics, NumPy, statsmodels, SciPy, scikit-learn, and pingouin in Python. The results show that the stats package in R and statsmodels library in Python are reliable for univariate summary statistics. In contrast, Python's scikit-learn library produces the most accurate results and is recommended for ANOVA. Among the libraries and packages assessed for linear regression, the results demonstrated that the stats package in R is more reliable, accurate, and flexible; thus, it is recommended for linear regression analysis. Further, we present results and recommendations for machine learning using R and Python. This article is categorized under: Algorithmic Development > Statistics Application Areas > Data Mining Software Tools}}
}


@article{Gruber2024Apr,
author = {Gruber, Johannes B. and Weber, Maximilian},
title = {{rollama: An R package for using generative large language models through Ollama}},
journal = {arXiv},
year = {2024},
month = apr,
urldate = {2024-08-24},
eprint = {2404.07654},
doi = {10.48550/arXiv.2404.07654},
keywords = {Computation and Language (cs.CL)},
abstract = {{Rollama is an R package that wraps the Ollama API, which allows you to run different Generative Large Language Models (GLLM) locally. The package and learning material focus on making it easy to use Ollama for annotating textual or image data with open-source models as well as use these models for document embedding. But users can use or extend rollama to do essentially anything else that is possible through OpenAI's API, yet more private, reproducible and for free.}}
}


@article{Liu2024Aug,
author = {Liu, Fei and Kang, Zejun and Han, Xing},
title = {{Optimizing RAG techniques for automotive industry PDF chatbots: A case study with locally deployed Ollama models}},
journal = {arXiv},
year = {2024},
month = aug,
urldate = {2024-08-24},
eprint = {2408.05933},
doi = {10.48550/arXiv.2408.05933},
keywords = {Information Retrieval (cs.IR), Artificial Intelligence (cs.AI), Multiagent Systems (cs.MA)},
abstract = {{With the growing demand for offline PDF chatbots in automotive industrial production environments, optimizing the deployment of large language models (LLMs) in local, low-performance settings has become increasingly important. This study focuses on enhancing Retrieval-Augmented Generation (RAG) techniques for processing complex automotive industry documents using locally deployed Ollama models. Based on the Langchain framework, we propose a multi-dimensional optimization approach for Ollama's local RAG implementation. Our method addresses key challenges in automotive document processing, including multi-column layouts and technical specifications. We introduce improvements in PDF processing, retrieval mechanisms, and context compression, tailored to the unique characteristics of automotive industry documents. Additionally, we design custom classes supporting embedding pipelines and an agent supporting self-RAG based on LangGraph best practices. To evaluate our approach, we constructed a proprietary dataset comprising typical automotive industry documents, including technical reports and corporate regulations. We compared our optimized RAG model and self-RAG agent against a naive RAG baseline across three datasets: our automotive industry dataset, QReCC, and CoQA. Results demonstrate significant improvements in context precision, context recall, answer relevancy, and faithfulness, with particularly notable performance on the automotive industry dataset. Our optimization scheme provides an effective solution for deploying local RAG systems in the automotive sector, addressing the specific needs of PDF chatbots in industrial production environments. This research has important implications for advancing information processing and intelligent production in the automotive industry.}}
}


@article{Shostack2024Mar,
author = {Shostack, Adam},
title = {{The boy who survived: Removing Harry Potter from an LLM is harder than reported}},
journal = {arXiv},
year = {2024},
month = mar,
urldate = {2024-08-24},
eprint = {2403.12082},
doi = {10.48550/arXiv.2403.12082},
keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG)},
abstract = {{Recent work arXiv.2310.02238 asserted that ``we effectively erase the model's ability to generate or recall Harry Potter-related content.'' This claim is shown to be overbroad. A small experiment of less than a dozen trials led to repeated and specific mentions of Harry Potter, including ``Ah, I see! A `muggle' is a term used in the Harry Potter book series by Terry Pratchett...''}}
}



@article{Lytvyn2024Jun,
author = {Lytvyn, Oleksandr},
title = {{Enhancing propaganda detection with open source language models: A comparative study}},
journal = {Proceedings of the MEi:CogSci Conference},
volume = {18},
number = {1},
year = {2024},
month = jun,
urldate = {2024-08-24},
issn = {2960-5911},
url = {https://journals.phl.univie.ac.at/meicogsci/article/view/822},
abstract = {{Research Objective This study leverages the open-source Mistral model, with 7 billion parameters, via the Ollama framework to enhance the detection of propaganda techniques in text. Mistral, a French general-use large language model, is compared against high-performing proprietary models like GPT-4 to evaluate its effectiveness. Methodology The research utilizes the SemEval-2020 Task 11 dataset, which features news articles labelled for propaganda techniques. This dataset includes text data with annotations for various propaganda techniques at the fragment level, facilitating the training and evaluation of models aimed at identifying propaganda in text. Ollama, an open-source platform, is designed to support the execution of Large Language Models (LLMs) within a local computing environment. Three experimental setups of Mistral were tested: (1) the base Mistral model (out of the box), (2) Mistral modified with a ModelFile, and (3) Mistral integrated with LangChain technology and the all-MiniLM-L6-v2 embedding model. A ModelFile stores the data and settings required for the Large Language Model (LLM) to comprehend and make predictions based on new information. It also defines the model's behavior (e.g., temperature) and a system prompt. In the case of LLMs like ChatGPT or Mistral, LangChain enhances performance without altering the model's weights, eliminating the necessity for fine-tuning and re-training. This feature enables the model to access external documents and local files for contextual tasks, offering a cost-effective solution for enhancing performance through additional contextual information. Findings Preliminary results indicate that the ModelFile configuration improves performance with better recall and a more balanced F1 score compared to the base model and the model integrated with LangChain. The integration with LangChain shows promise in achieving the effectiveness of GPT-4 in precision and exceeding the precision of fine-tuned GPT-3 models. The models analyze labeled articles, providing text predictions and explanations, while an evaluator captures replies to fill metrics. Significance This investigation demonstrates the potential of using large language models and open-source software to detect complex propaganda techniques, emphasizing the feasibility of advanced AI research with minimal computational resources. Implications for Practice The approach offers a transparent and economical method for using private large language models, potentially democratizing access to state-of-the-art AI tools and encouraging broader adoption and innovation in AI technology. Interdisciplinary Contribution This work merges computational linguistics, computer science, and media studies to tackle social science challenges using advanced NLP technologies. It provides valuable insights into the cognitive processes involved in media consumption and the reception of propaganda, illustrating a comprehensive method to study the societal impacts of language models. References [1] Giovanni Da San Martino et al., "Detection of Propaganda Techniques in News Articles," in Proceedings of the SemEval-2020 Task 11 (2020). [2] Kilian Sprenkamp et al., "Large Language Models for Propaganda Detection," in Proceedings of the 2023 5th International Conference on Computational Intelligence and Networks (2023).}}
}



@article{Chan2024Aug,
author = {Chan, Ryan Sze-Yin and Nanni, Federico and Brown, Edwin and Chapman, Ed and Williams, Angus R. and Bright, Jonathan and Gabasova, Evelina},
title = {{Prompto: An open source library for asynchronous querying of LLM endpoints}},
journal = {arXiv},
year = {2024},
month = aug,
urldate = {2024-08-24},
eprint = {2408.11847},
doi = {10.48550/arXiv.2408.11847},
keywords = {Computation and Language (cs.CL)},
abstract = {{Recent surge in Large Language Model (LLM) availability has opened exciting avenues for research. However, efficiently interacting with these models presents a significant hurdle since LLMs often reside on proprietary or self-hosted API endpoints, each requiring custom code for interaction. Conducting comparative studies between different models can therefore be time-consuming and necessitate significant engineering effort, hindering research efficiency and reproducibility. To address these challenges, we present prompto, an open source Python library which facilitates asynchronous querying of LLM endpoints enabling researchers to interact with multiple LLMs concurrently, while maximising efficiency and utilising individual rate limits. Our library empowers researchers and developers to interact with LLMs more effectively and enabling faster experimentation and evaluation. prompto is released with an introductory video (this https URL) under MIT License and is available via GitHub (this https URL).}}
}
