5 changes: 5 additions & 0 deletions Evaluations/AI4EIC_sample_dataset.csv
@@ -0,0 +1,5 @@
input_arxiv_id,input_category,input_nclaims,input_question,input_run_trace,output_claims,output_complete_response,output_individual_response,output_nclaims
2204.08645v1,physics.ins-det,3, What are the main detectors proposed for the Electron-Ion Collider (EIC)? What are the optical processes involved in Cherenkov detectors? How is AI being utilized for imaging Cherenkov detectors at EIC?,https://smith.langchain.com/public/8a7701bd-64b6-4144-967f-35749a27ebf8/r,"[""Main detectors proposed for the Electron-Ion Collider (EIC)"", ""Optical processes involved in Cherenkov detectors"", ""Utilization of AI for imaging Cherenkov detectors at EIC""]"," All detector designs proposed for EIC have a dual radiator ring-imaging Cherenkov detector (dRICH) in the hadron direction, detection of internally reflected Cherenkov light (DIRC) in the barrel, and a modular-aerogel RICH (mRICH) in the electron direction. Cherenkov detectors involve optical processes with many photons that need to be tracked through complex surfaces, making these detectors relatively slow to simulate. AI is being utilized for optimizing the design of Cherenkov detectors and for reducing the computing budget necessary to explore a large number of design points.,
","[""dual radiator ring-imaging Cherenkov detector (dRICH), detection of internally reflected Cherenkov light (DIRC), and a modular-aerogel RICH (mRICH)"", ""Optical processes with many photons that need to be tracked through complex surfaces"", ""AI is being utilized for optimizing the design of Cherenkov detectors and for reducing the computing budget necessary to explore a large number of design points""]",3
2309.00818v1,physics.ins-det,2, What is the purpose of the high-granularity calorimeter insert for the Electron-Ion Collider (EIC)? What innovative features are used in its design?,https://smith.langchain.com/public/1a32c87e-6944-4241-a37e-a0f512e2db96/r,"[""Purpose of the high-granularity calorimeter insert for the Electron-Ion Collider (EIC)"", ""Innovative features used in the design of the high-granularity calorimeter insert""]"," The purpose of the high-granularity calorimeter insert for the Electron-Ion Collider (EIC) is to cover the range 3.2< 𝜂 < 4.0 and to offer high granularity to enhance performance in measuring jets, and to manage radiation damage and beam-gas interactions effectively. Among its innovative features are an ASIC-away-of-SiPM strategy for reducing cooling requirements and minimizing space use, along with employing 3D-printed frames to reduce optical crosstalk and dead areas.,
","[""To cover the range 3.2< 𝜂 < 4.0 and to offer high granularity"", ""ASIC-away-of-SiPM strategy for reducing cooling requirements and minimizing space use, employing 3D-printed frames to reduce optical crosstalk and dead areas""]",2
396 changes: 135 additions & 261 deletions Evaluations/RunRAGAS.ipynb

Large diffs are not rendered by default.

340 changes: 340 additions & 0 deletions Evaluations/test_RunRagas.py
@@ -0,0 +1,340 @@

import toml, os, sys
import time
import torch # Added for GPU availability check

# Check if GPU is available
if not torch.cuda.is_available():
print("Warning: GPU not available, falling back to CPU")
else:
print(f"GPU available: {torch.cuda.get_device_name(0)}")

# -------------------------------
sys.path.append(os.path.realpath("../"))

# with open("../../.streamlit/secrets.toml") as f:
# secrets = toml.load(f)

# os.environ["OPENAI_API_KEY"] = secrets["OPENAI_API_KEY"]
# if secrets.get("LANGCHAIN_API_KEY"):
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
print(langchain_api_key)
os.environ["LANGCHAIN_API_KEY"] = langchain_api_key
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["LANGCHAIN_ENDPOINT"] = secrets["LANGCHAIN_ENDPOINT"]
print(f"langchain_api_key :" , {langchain_api_key})

pinecone_api_key = os.getenv("PINECONE_API_KEY")
os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY") #secrets["PINECONE_API_KEY"]

#--------------------------
import nest_asyncio

nest_asyncio.apply()

# Get the dataset
from langsmith import Client
from langsmith.utils import LangSmithError

client = Client()

#-----------------------
import pandas as pd
df = pd.read_csv("AI4EIC_sample_dataset.csv", sep = ",")

#---------------------
# from langchain_openai import OpenAIEmbeddings
# from langchain_openai import ChatOpenAI
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_chroma import Chroma
from streamlit_app.app_utilities import *
from streamlit_app.LangChainUtils.LLMChains import *
from langchain import callbacks
from langsmith import Client
from langchain_core.tracers.context import tracing_v2_enabled
from langchain.callbacks.tracers import LangChainTracer
from ragas.run_config import RunConfig
import json


# def RunQuery(input_question, max_k, sim_score):
def RunQuery(input_question, max_k, sim_score,
collection_name=None, db_name=None, table_name=None):

    # LLM for generating output from the prompt
llm = ChatOllama(model="llama3.2:latest", temperature=0, num_predict=4096)

## Configure LLM with GPU-optimized parameters
# llm = ChatOllama(
# model="llama3.2:latest",
# temperature=0,
# num_predict=4096,
# num_gpu=999, # Use maximum available GPU layers
# num_threads=8 # Adjust based on your GPU/CPU setup
# )

# embeddings = OpenAIEmbeddings()
embeddings = OllamaEmbeddings(model="mxbai-embed-large:latest")

# Defining some props of DB
SimilarityDict = {"Cosine similarity" : "similarity", "MMR" : "mmr"}

#create an instance of the DB
DBProp = {"PINECONE" : {
"vector_config" : {"db_api_key" :pinecone_api_key, "index_name" : "llm-project", "embedding_function" : embeddings},
"search_config" : {"metric" : sim_score, "search_kwargs" : {"k" : max_k}},
"available_metrics" : ["Cosine similarity", "MMR"]
},
"CHROMA": {
"vector_config": {"db_name": db_name, "embedding_function": embeddings, "collection_name": collection_name},
"search_config": {"metric": sim_score, "search_kwargs": {"k": max_k}},
"available_metrics": ["Cosine similarity", "MMR"],
},
"LANCE": {
"vector_config": {"db_name": db_name, "table_name": table_name},
"search_config": {"metric": sim_score, "search_kwargs": {"k": max_k}},
"available_metrics": ["Cosine similarity", "MMR"],
},
}

#create an instance of the VectorDB
retriever = GetRetriever("CHROMA", DBProp["CHROMA"]["vector_config"], DBProp["CHROMA"]["search_config"])
print("output of Getretriever")

# project name for tracing in Langsmith
project_name = f"RAG-CHAT-tapasi"

# Create a LangChain tracer for tracing the run
tracer = LangChainTracer(project_name = project_name)
print("out of LangChainTracer")

run_name = "Evaluation-testings"
trace_metadata = {"DBType": "CHROMA",
"similarity_score": sim_score,
"max_k": max_k
}
RUNCHAIN = RunChatBot(llm, retriever, "../Templates"
).with_config({"callbacks": [tracer],
"run_name": run_name,
"metadata": trace_metadata}
)
print("out of RunCHatBot")
trace_id = ""
response = ""
runid = ""
with tracing_v2_enabled(project_name) as cb:
with callbacks.collect_runs() as ccb:
output = RUNCHAIN.invoke(input_question)

## modify to ensure json format
# response = output["answer"]
# Ensure output is a JSON-compatible dictionary
if isinstance(output, dict) and "answer" in output:
response = json.dumps({"answer": output["answer"]})
else:
response = json.dumps({"answer": str(output)})

print (output)
print (len(ccb.traced_runs))
for run in ccb.traced_runs:
runid = run.id
print (run.name)
print (run.id)
print (run.inputs)
print (run.outputs)
print (run.trace_id)
trace_id = run.trace_id
return response, trace_id, client.share_run(runid)

def RunLLM(input_question, MODEL = "llama3.2:latest"):
# model_name = f"gpt-3.5-turbo-1106" if GPTMODEL == 3 else "gpt-4-0125-preview"
print (f"input_question, {input_question}")
# llm = ChatOpenAI(model_name=model_name, temperature=0,
# max_tokens = 4096
# )
    llm = ChatOllama(model=MODEL, temperature=0, num_predict=4096)
# # For GPU
# llm = ChatOllama(
# model_name=MODEL,
# temperature=0,
# num_predict=4096,
# num_gpu=999 # Maximize GPU usage
# )
output = llm.invoke(input_question).content
## for json format
# return output

print(f"output of llm : {output}")


#---------------------------------------------
import pickle
from datasets import Dataset

from langchain_ollama import OllamaEmbeddings, ChatOllama
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
# from langchain_openai import OpenAIEmbeddings
from ragas.metrics import (
faithfulness,
answer_relevancy,
context_precision,
context_recall,
answer_correctness
)

import ragas
'''RAGAS metrics use OpenAI models by default, so we explicitly define an Ollama LLM and
embedding model and use them to compute the evaluation metrics (see the sketch below).'''
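# --- Hedged sketch (not in the original script): routing RAGAS through Ollama ---
# The wrappers imported above are how RAGAS accepts LangChain models; a plausible way
# to make the metrics below run against Ollama instead of the default OpenAI backend
# would be to wrap the same models used elsewhere in this file and pass them to
# evaluate() via its llm/embeddings arguments. Treat the exact wiring as an assumption,
# not something verified against this PR.
ragas_llm = LangchainLLMWrapper(ChatOllama(model="llama3.2:latest", temperature=0))
ragas_embeddings = LangchainEmbeddingsWrapper(OllamaEmbeddings(model="mxbai-embed-large:latest"))
# e.g.: evaluate(DATASET, metrics=[...], llm=ragas_llm, embeddings=ragas_embeddings, run_config=run_config)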

ANSWER_CORRECTNESS = ragas.metrics.AnswerCorrectness(name = "ANSWER_CORRECTNESS",
weights = [0.90, 0.10],
)
ANSWER_RELEVANCY = ragas.metrics.AnswerRelevancy(name = "ANSWER_RELEVANCY",
strictness = 5,
)
CONTEXT_ENTITY_RECALL = ragas.metrics.ContextEntityRecall(name = "CONTEXT_ENTITY_RECALL",
)
CONTEXT_PRECISION = ragas.metrics.ContextPrecision(name = "CONTEXT_PRECISION"
)
CONTEXT_RECALL = ragas.metrics.ContextRecall(name = "CONTEXT_RECALL")
## newer RAGAS releases no longer provide this evaluation metric
# CONTEXT_RELEVANCY = ragas.metrics.ContextRelevancy(name = "CONTEXT_RELEVANCY")

FAITHFULNESS = ragas.metrics.Faithfulness(name = "FAITHFULNESS")

# the benchmark Q&A dataset
import pandas as pd
df = pd.read_csv("AI4EIC2023_DATASETS.csv", sep = ",")

from ragas import evaluate
dataset = {"question": [],
"answer": [],
"contexts": [],
"ground_truth": [],
"arxiv_id": [],
"input_arxiv_id": [],
"trace_links": []
}

# no of chunks to be retrieved
max_k = 20
sim_score = "mmr"
db_name="../ingestion/myChromaDB"
collection_name = "EIC_archive"
table_name = "arxiv_table"

if (os.path.exists(f"results_k_{max_k}_sim_{sim_score}.csv")):
os.system(f"rm -f results_k_{max_k}_sim_{sim_score}.csv")

for index, row in df.iterrows():
question = row["input_question"]
answer, trace_id, trace_link = RunQuery(question, max_k, sim_score, db_name=db_name, collection_name=collection_name)
print(f" RunQuery : answer : {answer}, trace_id : {trace_id}, trace_link : {trace_link}")

project_name = f"RAG-CHAT-tapasi"
run_name = "Evaluation-testings"

# if verbose==1:
# print(f"before langsmith is called")

runs = client.list_runs(project_name = project_name, trace_id = trace_id)
print(f"after langsmith client is called : , {runs}")
contexts = []
cite_arxiv_ids = []
for run in runs:
if (run.run_type.lower() == "retriever"):
print (run.name)
print (run.id)
print (run.inputs)
print (run.outputs)
for i in run.outputs['documents']:
contexts.append(i["page_content"])
cite_arxiv_ids.append(i["metadata"]["arxiv_id"].split("/")[-1].strip())
print (run.trace_id)
print ("-----")

dataset["question"].append(question)
print (answer.split("http://")[0].strip("\n"))
dataset["answer"].append(answer.split("http://")[0].strip("\n"))

dataset["contexts"].append(contexts)
dataset["ground_truth"].append(row["output_complete_response"])
dataset["input_arxiv_id"].append(row["input_arxiv_id"])
dataset["arxiv_id"].append(cite_arxiv_ids)
dataset["trace_links"].append(trace_link)

with open(f"dataset_k_{max_k}_sim_{sim_score}.pkl", "wb") as f:
pickle.dump(dataset, f)

tmpdataset = {}
for key, value in dataset.items():
tmpdataset[key] = [value[-1]]
DATASET = Dataset.from_dict(tmpdataset)

# Start time
start_time = time.time()
print(start_time)

# Configure run_config with custom timeout

run_config = RunConfig(
timeout=600, # Set timeout to 10 minutes (600 seconds)
max_workers=1 # Sequential processing to avoid Ollama overload
)

result = evaluate(DATASET,
metrics = [
# FAITHFULNESS,
# # CONTEXT_RELEVANCY,
CONTEXT_ENTITY_RECALL
# CONTEXT_PRECISION,
# CONTEXT_RECALL,
# ANSWER_RELEVANCY,
# ANSWER_CORRECTNESS
],
run_config = run_config
)
result_df = result.to_pandas()
if (os.path.exists(f"results_k_{max_k}_sim_{sim_score}.csv")):
df = pd.read_csv(f"results_k_{max_k}_sim_{sim_score}.csv", sep = ",")
result_df = pd.concat([df, result_df])
result_df.to_csv(f"results_k_{max_k}_sim_{sim_score}.csv", index = False)

# End time
end_time = time.time()
delta_time = end_time - start_time
print(f"time taken : {delta_time}")


# -------------------------------------------------------

# import asyncio
# from ragas import evaluate

# async def run_evaluation(dataset, metrics):
# return evaluate(dataset, metrics=metrics)

# try:
# result = asyncio.run(asyncio.wait_for(
# run_evaluation(
# DATASET,
# [
# FAITHFULNESS,
# CONTEXT_ENTITY_RECALL,
# CONTEXT_PRECISION,
# CONTEXT_RECALL,
# ANSWER_RELEVANCY,
# ANSWER_CORRECTNESS
# ]
# ),
# timeout=600  # 600 seconds
# ))
# result_df = result.to_pandas()
# if os.path.exists(f"results_k_{max_k}_sim_{sim_score}.csv"):
# df = pd.read_csv(f"results_k_{max_k}_sim_{sim_score}.csv", sep=",")
# result_df = pd.concat([df, result_df])
# result_df.to_csv(f"results_k_{max_k}_sim_{sim_score}.csv", index=False)
# except asyncio.TimeoutError:
# print(f"Evaluation timed out after 300 seconds")
# continue
11 changes: 10 additions & 1 deletion Templates/reponse_01.template
@@ -27,6 +27,7 @@ Here is the response template that you need to strictly follow:
- End with a closing remark and a list of sources with their respective URLs as a bullet list explicitly with full links which are enclosed in the tag <ARXIV_ID> and </ARXIV_ID> respectively.
- Your references have to strictly follow the `Example response` as template.
- Strictly use the styling of response based on the `Example response`.

---

Here is how a response would look. Reproduce the same format for your response:
@@ -72,5 +73,13 @@ Make sure these citations are relevant and strictly do not repeat the con
REMEMBER: If there is no relevant information within the context, just say "Hmm, I'm \
not sure." or greet back. Don't try to make up an answer. Anything between the preceding 'context' \
html blocks is retrieved from a knowledge bank, not part of the conversation with the \
user.\
user.

----
Strictly note the points below
- You must always return valid JSON fenced by a markdown code block. Do not return any additional text.
- Do not add any text outside the JSON code block.
- Escape special characters (e.g., quotes, backslashes) to ensure valid JSON.
\

Question: {question}
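Since the template now instructs the model to return its answer inside a fenced JSON block, any consumer of that output needs to strip the fence before parsing. A minimal sketch of such a parser is shown below; it is illustrative only — the helper name and the single-key `answer` schema are assumptions mirroring how `test_RunRagas.py` wraps responses, not code from this PR.

```python
import json
import re

def parse_fenced_json(raw: str) -> dict:
    """Extract and parse a JSON object from a ```json ... ``` fenced block.

    Falls back to parsing the whole string if no fence is found.
    Hypothetical helper, not part of this repository.
    """
    match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, flags=re.DOTALL)
    payload = match.group(1) if match else raw
    return json.loads(payload)

# Example:
# parse_fenced_json('```json\n{"answer": "The dRICH sits in the hadron direction."}\n```')
# -> {'answer': 'The dRICH sits in the hadron direction.'}
```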
5 changes: 5 additions & 0 deletions ingestion/README.md
@@ -21,3 +21,8 @@ When approaching this within the naive RAG framework, one can find intelligent ways

## Modular RAG System

### To generate a sample vectorDB after downloading the sample arXiv files
- `python download_arxiv.py -i test_arxiv_sources.info -o downloaded_files`

### To run the `ingest.py` file
