"""Analyse an intake-catalog dataset with a GitHub-hosted AI model.

Loads a dataset through the project's intake catalog, summarises it with
``DataFrame.describe`` and asks an OpenAI-compatible chat model (GitHub
Models endpoint) for an HTML-formatted analysis of that summary.
"""

import os
from pathlib import Path

import intake
from openai import OpenAI

from analytics_framework import INTAKE_LOC

# Data is read via the project's intake catalog.
CATALOG_LOC = Path(INTAKE_LOC) / "catalog_entry.yml"
catalog = intake.open_catalog(CATALOG_LOC)

# Credentials/endpoint for the GitHub Models (OpenAI-compatible) service.
# KeyError here is deliberate: fail fast when GITHUB_TOKEN is not configured.
token = os.environ["GITHUB_TOKEN"]
endpoint = "https://models.inference.ai.azure.com"
model_name = "gpt-4o-mini"

# Initialize the OpenAI client against the GitHub Models endpoint.
client = OpenAI(
    base_url=endpoint,
    api_key=token,
)


def analyze_data(intake_catalog_entry):
    """Summarise a catalog entry and print the model's HTML analysis.

    Parameters
    ----------
    intake_catalog_entry : str
        Name of the dataset entry in the intake catalog.

    Returns
    -------
    str or None
        The model's analysis text, or ``None`` when loading the data or
        generating the response failed (errors are printed, not raised).
    """
    # Load the data via intake; best-effort — report the error and bail out.
    try:
        df_input = catalog[intake_catalog_entry].read()
        print(f"Data loaded successfully {df_input.head()}")
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

    # Prepare the data for analysis (simple description of the dataset).
    summary = df_input.describe().to_string()

    # Create the system and user messages for the model.
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant skilled in analyzing data.",
        },
        {
            "role": "user",
            "content": f"Here is a summary of my data:\n{summary}\nProvide an analysis of this dataset, "
            f"display in html format along with the dataset provided.",
        },
    ]

    # Ask the chat model (gpt-4o-mini) for the analysis.
    try:
        response = client.chat.completions.create(
            messages=messages,
            model=model_name,
            temperature=1.0,
            max_tokens=1000,
            top_p=1.0,
        )
    except Exception as e:
        print(f"Error generating response: {e}")
        return None

    # Output (and return) the analysis from the model.
    analysis = response.choices[0].message.content
    print(analysis)
    return analysis


if __name__ == "__main__":
    # Example usage; guarded so importing this module performs no API call.
    analyze_data("address_sample")
+ + +Statistic | +Value | +
---|---|
Count | +5 | +
Mean | +21769.80 | +
Standard Deviation | +39059.21 | +
Minimum | +123 | +
25th Percentile | +298 | +
Median (50th Percentile) | +8075 | +
75th Percentile | +9119 | +
Maximum | +91234 | +
The dataset consists of 5 observations. The mean value is significantly skewed by a few extreme values, particularly the maximum value of 91234, which is substantially higher than the other values. The standard deviation (39059.21) indicates high variability in the data.
+Looking at the spread of the data:
+This indicates that while there are some higher values, they are outliers compared to the rest of the data. Such outliers can affect overall analysis and should be treated accordingly depending on the context of the study.
+ + + diff --git a/environment.yml b/environment.yml index 50326ab0..866ef20e 100644 --- a/environment.yml +++ b/environment.yml @@ -31,3 +31,4 @@ dependencies: - pip: - mitoinstaller - quarto + - openai