From 74d19680426a1f364c6fc58a291ff949fe9c85ae Mon Sep 17 00:00:00 2001 From: bilgeyucel Date: Tue, 8 Jul 2025 14:55:47 +0200 Subject: [PATCH 1/2] Update name and generators --- index.toml | 2 +- notebooks/apify_haystack_rag.ipynb | 107 +++++++++++++++++------------ 2 files changed, 64 insertions(+), 45 deletions(-) diff --git a/index.toml b/index.toml index e26ea87..b5ceb51 100644 --- a/index.toml +++ b/index.toml @@ -14,7 +14,7 @@ notebook = "amazon_sagemaker_and_chroma_for_qa.ipynb" topics = ["RAG"] [[cookbook]] -title = "RAG: Extract and use website content for question answering with Apify-Haystack integration" +title = "Crawl Website Content for Question Answering with Apify" notebook = "apify_haystack_rag.ipynb" topics = ["RAG", "Web-QA"] diff --git a/notebooks/apify_haystack_rag.ipynb b/notebooks/apify_haystack_rag.ipynb index 2a874c1..ae2431d 100644 --- a/notebooks/apify_haystack_rag.ipynb +++ b/notebooks/apify_haystack_rag.ipynb @@ -6,7 +6,7 @@ "id": "t1BeKtSo7KzI" }, "source": [ - "# RAG: Extract and use website content for question answering with Apify-Haystack integration\n", + "# Crawl Website Content for Question Answering with Apify\n", "\n", "Author: Jiri Spilka ([Apify](https://apify.com/jiri.spilka))\n", "\n", @@ -20,11 +20,15 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "r5AJeMOE1Cou" + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "r5AJeMOE1Cou", + "outputId": "4b47b345-bc7f-4e07-f69d-a9bbc324b5b6" }, "outputs": [], "source": [ - "!pip install apify-haystack haystack-ai" + "!pip install -q apify-haystack" ] }, { @@ -42,13 +46,13 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yiUTwYzP36Yr", - "outputId": "b53b07db-42a9-4109-e322-705a8312da2e" + "outputId": "28c743be-d94f-447f-8839-7cdd5cf258be" }, "outputs": [ { @@ -85,7 +89,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "id": "_AYgcfBx681h" }, @@ -109,12 +113,12 @@ "```\n", "[\n", " {\n", - " \"url\": \"https://haystack.deepset.ai/\",\n", - " \"text\": \"Haystack | Haystack - Multimodal - AI - Architect a next generation AI app around all modalities, not just text ...\"\n", + " \"url\": \"https://haystack.deepset.ai/overview/quick-start\",\n", + " \"text\": \"Haystack is an open-source AI framework to build custom production-grade LLM ...\"\n", " },\n", " {\n", - " \"url\": \"https://haystack.deepset.ai/tutorials/24_building_chat_app\",\n", - " \"text\": \"Building a Conversational Chat App ... \"\n", + " \"url\": \"https://haystack.deepset.ai/cookbook\",\n", + " \"text\": \"You can use these examples as guidelines on how to make use of different mod... 
\"\n", " },\n", "]\n", "```\n", @@ -124,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "id": "OZ0PAVHI_mhn" }, @@ -147,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "id": "gdN7baGrA_lR" }, @@ -173,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "id": "zKr0KTfhAQz6" }, @@ -198,32 +202,46 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "id": "qfaWI6BaAko9" + }, + "outputs": [], + "source": [ + "# Crawler website and store documents in the document_store\n", + "# Crawling will take some time (1-2 minutes), you can monitor progress in the https://console.apify.com/actors/runs\n", + "\n", + "docs = apify_dataset_loader.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "qfaWI6BaAko9", - "outputId": "ba5e115e-4c9d-42fd-c167-0bf06163d52c" + "id": "LXcfam6pFJG-", + "outputId": "70b23d91-6ac4-4daa-a422-c636546225f6" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'documents': [Document(id=6c4d570874ff59ed4e06017694bee8a72d766d2ed55c6453fbc9ea91fd2e6bde, content: 'Haystack | Haystack Luma Β· Delightful Events Start HereAWS Summit Berlin 2023: Building Generative A...', meta: {'url': 'https://haystack.deepset.ai/'}), Document(id=d420692bf66efaa56ebea200a4a63597667bdc254841b99654239edf67737bcb, content: 'Tutorials & Walkthroughs | Haystack\n", - "Tutorials & Walkthroughs2.0\n", - "Whether you’re a beginner or an expe...', meta: {'url': 'https://haystack.deepset.ai/tutorials'}), Document(id=5a529a308d271ba76f66a060c0b706b73103406ac8a853c19f20e1594823efe8, content: 'Get Started | Haystack\n", - "Haystack is an open-source Python framework that helps developers build LLM-p...', meta: {'url': 'https://haystack.deepset.ai/overview/quick-start'}), Document(id=1d126a03ae50586729846d492e9e8aca802d7f281a72a8869ded08ebc5585a36, content: 'What is Haystack? | Haystack\n", - "Haystack is an open source framework for building production-ready LLM ...', meta: {'url': 'https://haystack.deepset.ai/overview/intro'}), Document(id=4324a62242590d4ecf9b080319607fa1251aa0822bbe2ce6b21047e783999703, content: 'Integrations | Haystack\n", - "The Haystack ecosystem integrates with many other technologies, such as vect...', meta: {'url': 'https://haystack.deepset.ai/integrations'})]}\n" + "{'documents': [Document(id=3650d4d2050c97d0b20d6bb9202eb72494e2dc6ad0222a7e4a7bad038780ab31, content: 'Haystack | Haystack\n", + "Multimodal\n", + "AI\n", + "Architect a next generation AI app around all modalities, not just...', meta: {'url': 'https://haystack.deepset.ai/'}, embedding: vector of size 1536), Document(id=a441728f7b8c8f7541304f23be229372f526306c6d39f634fecf245923d2f239, content: 'What is Haystack? 
| Haystack\n", + "Haystack is an open-source AI orchestration framework built by deepset ...', meta: {'url': 'https://haystack.deepset.ai/overview/intro'}, embedding: vector of size 1536), Document(id=82282e7eb3115bf0e8efbaaa4de70fd68bcd1bebf25218a68973c3441ff9638f, content: 'Demos | Haystack\n", + "Check out demos built with Haystack!\n", + "AutoQuizzer\n", + "Try out our AutoQuizzer demo built...', meta: {'url': 'https://haystack.deepset.ai/overview/demo'}, embedding: vector of size 1536), Document(id=55f775825a43a52c8f51f4ba08713389a652e05eb992ed15d7c18bbe68bbe38a, content: 'Get Started | Haystack\n", + "Haystack is an open-source AI framework to build custom production-grade LLM ...', meta: {'url': 'https://haystack.deepset.ai/overview/quick-start'}, embedding: vector of size 1536), Document(id=1b7ed59f60d536b9e1903b9c66f86e942bfed9bab5ae9f32dcecc6645b95daab, content: 'πŸ§‘β€πŸ³ Cookbook | Haystack\n", + "You can use these examples as guidelines on how to make use of different mod...', meta: {'url': 'https://haystack.deepset.ai/cookbook'}, embedding: vector of size 1536)]}\n" ] } ], "source": [ - "# Crawler website and store documents in the document_store\n", - "# Crawling will take some time (1-2 minutes), you can monitor progress in the https://console.apify.com/actors/runs\n", - "\n", - "docs = apify_dataset_loader.run()\n", "print(docs)" ] }, @@ -238,20 +256,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YrKAkHLuCp6N", - "outputId": "a0234fa4-1265-4212-be6a-f844708126e3" + "outputId": "2df583d4-7f65-4ec5-a94f-7a756e06d3e4" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Calculating embeddings: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 3.29it/s]\n" + "Calculating embeddings: 1it [00:01, 1.07s/it]\n" ] }, { @@ -260,7 +278,7 @@ "5" ] }, - "execution_count": 29, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -283,13 +301,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "31W_jlNWFkz3", - "outputId": "af4ddf9a-3ea1-4d4a-bb08-bd2d828fe517" + "outputId": "f23e06af-1fd9-4e6f-c276-26e8ff9056a8" }, "outputs": [ { @@ -302,33 +320,34 @@ { "data": { "text/plain": [ - "\n", + "\n", "πŸš… Components\n", " - embedder: OpenAITextEmbedder\n", " - retriever: InMemoryEmbeddingRetriever\n", - " - prompt_builder: PromptBuilder\n", - " - llm: OpenAIGenerator\n", + " - prompt_builder: ChatPromptBuilder\n", + " - llm: OpenAIChatGenerator\n", "πŸ›€οΈ Connections\n", " - embedder.embedding -> retriever.query_embedding (List[float])\n", " - retriever.documents -> prompt_builder.documents (List[Document])\n", - " - prompt_builder.prompt -> llm.prompt (str)" + " - prompt_builder.prompt -> llm.messages (List[ChatMessage])" ] }, - "execution_count": 30, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from haystack import Pipeline\n", - "from haystack.components.builders import PromptBuilder\n", + "from haystack.components.builders import ChatPromptBuilder\n", "from haystack.components.embedders import OpenAITextEmbedder\n", - "from haystack.components.generators import OpenAIGenerator\n", + "from haystack.components.generators.chat import OpenAIChatGenerator\n", "from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever\n", + "from haystack.dataclasses import 
ChatMessage\n", "\n", "text_embedder = OpenAITextEmbedder()\n", "retriever = InMemoryEmbeddingRetriever(document_store)\n", - "generator = OpenAIGenerator(model=\"gpt-4o-mini\")\n", + "generator = OpenAIChatGenerator(model=\"gpt-4o-mini\")\n", "\n", "template = \"\"\"\n", "Given the following information, answer the question.\n", @@ -342,7 +361,7 @@ "Answer:\n", "\"\"\"\n", "\n", - "prompt_builder = PromptBuilder(template=template)\n", + "prompt_builder = ChatPromptBuilder(template=[ChatMessage.from_user(template)], required_variables=\"*\")\n", "\n", "# Add components to your pipeline\n", "print(\"Initializing pipeline...\")\n", @@ -370,13 +389,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uPtoRZEdF1BN", - "outputId": "cb8d2266-6274-42b8-cf25-765c2d3de62f" + "outputId": "f0e63536-f4e1-4d04-c6b7-75c872b71749" }, "outputs": [ { @@ -384,7 +403,7 @@ "output_type": "stream", "text": [ "question: What is haystack?\n", - "answer: Haystack is an open-source Python framework designed to help developers build LLM-powered custom applications. It is used for creating production-ready LLM applications, retrieval-augmented generative pipelines, and state-of-the-art search systems that work effectively over large document collections. Haystack offers comprehensive tooling for developing AI systems that use LLMs from platforms like Hugging Face, OpenAI, Cohere, Mistral, and more. It provides a modular and intuitive framework that allows users to quickly integrate the latest AI models, offering flexibility and ease of use. The framework includes components and pipelines that enable developers to build end-to-end AI projects without the need to understand the underlying models deeply. Haystack caters to LLM enthusiasts and beginners alike, providing a vibrant open-source community for collaboration and learning.\n" + "answer: Haystack is an open-source AI orchestration framework developed by deepset that enables Python developers to create real-world applications using large language models (LLMs). It provides tools for building various types of applications, including autonomous agents, multi-modal apps, and scalable retrieval-augmented generation (RAG) systems. Haystack's modular architecture allows users to customize components, experiment with state-of-the-art methods, and manage their technology stack effectively. It caters to developers at all levels, from prototyping to full-scale deployment, and is supported by a community that values open-source collaboration. 
Haystack can be utilized directly in Python or through a visual interface called deepset Studio.\n" ] } ], @@ -394,7 +413,7 @@ "response = pipe.run({\"embedder\": {\"text\": question}, \"prompt_builder\": {\"question\": question}})\n", "\n", "print(f\"question: {question}\")\n", - "print(f\"answer: {response['llm']['replies'][0]}\")" + "print(f\"answer: {response['llm']['replies'][0].text}\")" ] } ], From 33c212884c8e81275a43b3ac5c239770b09dec44 Mon Sep 17 00:00:00 2001 From: bilgeyucel Date: Tue, 8 Jul 2025 15:05:39 +0200 Subject: [PATCH 2/2] Update the instagram vibe checker notebook --- ...haystack_instagram_comments_analysis.ipynb | 84 ++++++++++--------- 1 file changed, 45 insertions(+), 39 deletions(-) diff --git a/notebooks/apify_haystack_instagram_comments_analysis.ipynb b/notebooks/apify_haystack_instagram_comments_analysis.ipynb index abd3618..f4c5319 100644 --- a/notebooks/apify_haystack_instagram_comments_analysis.ipynb +++ b/notebooks/apify_haystack_instagram_comments_analysis.ipynb @@ -30,16 +30,12 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, "collapsed": true, - "id": "r5AJeMOE1Cou", - "outputId": "63663073-ccc5-4306-ae18-e2720d937407" + "id": "r5AJeMOE1Cou" }, "outputs": [], "source": [ - "!pip install apify-haystack==0.1.4 haystack-ai" + "!pip install -q apify-haystack" ] }, { @@ -63,7 +59,7 @@ "base_uri": "https://localhost:8080/" }, "id": "yiUTwYzP36Yr", - "outputId": "d79acadc-bd18-44d3-c812-9b40c51d5124" + "outputId": "4a34577d-f3f1-4508-80f5-d367683ad017" }, "outputs": [ { @@ -189,34 +185,35 @@ "base_uri": "https://localhost:8080/" }, "id": "gdN7baGrA_lR", - "outputId": "b73b1217-3082-4da7-c824-b8671eeef78d" + "outputId": "fdbcd795-8bbe-4652-922b-f74d47fdb115" }, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ - "\n", + "\n", "πŸš… Components\n", " - loader: ApifyDatasetFromActorCall\n", " - cleaner: DocumentCleaner\n", - " - prompt_builder: PromptBuilder\n", - " - llm: OpenAIGenerator\n", + " - prompt_builder: ChatPromptBuilder\n", + " - llm: OpenAIChatGenerator\n", "πŸ›€οΈ Connections\n", " - loader.documents -> cleaner.documents (list[Document])\n", " - cleaner.documents -> prompt_builder.documents (List[Document])\n", - " - prompt_builder.prompt -> llm.prompt (str)" + " - prompt_builder.prompt -> llm.messages (List[ChatMessage])" ] }, - "execution_count": 5, "metadata": {}, - "output_type": "execute_result" + "execution_count": 5 } ], "source": [ "from haystack import Pipeline\n", - "from haystack.components.builders import PromptBuilder\n", - "from haystack.components.generators import OpenAIGenerator\n", + "from haystack.components.builders import ChatPromptBuilder\n", + "from haystack.components.generators.chat import OpenAIChatGenerator\n", "from haystack.components.preprocessors import DocumentCleaner\n", + "from haystack.dataclasses import ChatMessage\n", "\n", "prompt = \"\"\"\n", "Analyze these Instagram comments to determine if the post is generating positive energy, excitement,\n", @@ -232,8 +229,8 @@ "\"\"\"\n", "\n", "cleaner = DocumentCleaner(remove_empty_lines=True, remove_extra_whitespaces=True, remove_repeated_substrings=True)\n", - "prompt_builder = PromptBuilder(template=prompt)\n", - "generator = OpenAIGenerator(model=\"gpt-4o-mini\")\n", + "prompt_builder = ChatPromptBuilder(template=[ChatMessage.from_user(prompt)], required_variables=\"*\")\n", + "generator = OpenAIChatGenerator(model=\"gpt-4o-mini\")\n", "\n", "\n", "pipe = Pipeline()\n", 
@@ -257,37 +254,46 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { + "id": "qfaWI6BaAko9" + }, + "outputs": [], + "source": [ + "# \\@tiffintech on How to easily keep up with tech?\n", + "url = \"https://www.instagram.com/p/C_a9jcRuJZZ/\"\n", + "\n", + "res = pipe.run({\"loader\": {\"run_input\": {\"directUrls\": [url]}}})" + ] + }, + { + "cell_type": "code", + "source": [ + "res.get(\"llm\", {}).get(\"replies\", [\"No response\"])[0].text" + ], + "metadata": { + "id": "SZYtTd7TImGM", + "outputId": "07312231-505c-400f-e58e-a3fc2ba5f6b3", "colab": { "base_uri": "https://localhost:8080/", - "height": 72 - }, - "id": "qfaWI6BaAko9", - "outputId": "25e33c1b-f8b9-4b6d-a3d9-0eb54365b820" + "height": 140 + } }, + "execution_count": 8, "outputs": [ { + "output_type": "execute_result", "data": { + "text/plain": [ + "'The Instagram comments exhibit a strongly positive sentiment and emotional tone, indicating that the post is indeed generating high energy and excitement. The use of enthusiastic emojis (e.g., 🫢, πŸ”₯, 😁) and expressions of gratitude (e.g., \"Thank you!\" and \"great resource!\") suggest that the audience is actively engaging with the content in an uplifting manner.\\n\\nKey engagement patterns include:\\n\\n- **Expressions of Appreciation**: Multiple comments acknowledge the resource as beneficial, with users expressing gratitude and sharing their positive experiences.\\n- **Community Interaction**: Users mention and tag others (@tiffintech, @dailydotdev), indicating a vibrant community dynamic and a willingness to share the content with friends or followers.\\n- **Curiosity and Requests**: Questions about additional content (e.g., \"Link please?\" and \"would love a breakdown\") show that the audience is highly engaged and eager for more information.\\n\\nOverall, the comments reflect a post that is \"vibrating\" with high energy, as evidenced by the positive feedback, community connections, and engaged responses.'" + ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" - }, - "text/plain": [ - "'Overall, the Instagram comments on the post reflect positive energy, excitement, and high engagement. The use of emojis such as πŸ˜‚, 😍, πŸ™Œ, ❀️, and πŸ”₯ indicate enthusiasm and excitement. Many comments express gratitude, appreciation, and eagerness to explore the resources mentioned in the post. There are also interactions between users tagging each other and discussing their interest in the topic, further increasing engagement. Overall, the post seems to be generating high energy and positive vibes from the audience.'" - ] + } }, - "execution_count": 6, "metadata": {}, - "output_type": "execute_result" + "execution_count": 8 } - ], - "source": [ - "# \\@tiffintech on How to easily keep up with tech?\n", - "url = \"https://www.instagram.com/p/C_a9jcRuJZZ/\"\n", - "\n", - "res = pipe.run({\"loader\": {\"run_input\": {\"directUrls\": [url]}}})\n", - "res.get(\"llm\", {}).get(\"replies\", [\"No response\"])[0]\n", - "\n" ] }, { @@ -301,7 +307,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -365,4 +371,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file
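Editor's note (not part of either patch): both notebooks are being migrated from `PromptBuilder`/`OpenAIGenerator` to the chat-based `ChatPromptBuilder`/`OpenAIChatGenerator` components, with the prompt wrapped in `ChatMessage.from_user(...)` and the generator's replies read via `.text`. The following is a minimal, self-contained sketch of that pattern, assembled only from code shown in the patches; it assumes `haystack-ai` 2.x, an `OPENAI_API_KEY` in the environment, and uses a fresh `InMemoryDocumentStore` as a stand-in for the store the RAG notebook actually fills with Apify-crawled, embedded documents.

```python
from haystack import Pipeline
from haystack.components.builders import ChatPromptBuilder
from haystack.components.embedders import OpenAITextEmbedder
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.dataclasses import ChatMessage
from haystack.document_stores.in_memory import InMemoryDocumentStore

# Stand-in store; the notebook populates it with embedded documents crawled via Apify.
document_store = InMemoryDocumentStore()

template = """
Given the following information, answer the question.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{ question }}
Answer:
"""

pipe = Pipeline()
pipe.add_component("embedder", OpenAITextEmbedder())
pipe.add_component("retriever", InMemoryEmbeddingRetriever(document_store))
# The template is passed as a list of ChatMessage objects instead of a raw string.
pipe.add_component(
    "prompt_builder",
    ChatPromptBuilder(template=[ChatMessage.from_user(template)], required_variables="*"),
)
pipe.add_component("llm", OpenAIChatGenerator(model="gpt-4o-mini"))

pipe.connect("embedder.embedding", "retriever.query_embedding")
pipe.connect("retriever.documents", "prompt_builder.documents")
# The chat generator expects messages, not a prompt string.
pipe.connect("prompt_builder.prompt", "llm.messages")

question = "What is Haystack?"
response = pipe.run({"embedder": {"text": question}, "prompt_builder": {"question": question}})

# Replies are ChatMessage objects, hence the .text accessor used throughout the updated notebooks.
print(response["llm"]["replies"][0].text)
```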