diff --git a/examples/india_box_plots.py b/examples/india_box_plots.py
new file mode 100644
index 0000000..0a0a057
--- /dev/null
+++ b/examples/india_box_plots.py
@@ -0,0 +1,89 @@
+"""Box plots of max cumulative energy generation per system"""
+import os
+
+import h5py
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+
+# load hdf file with the generation data for each system
+pv_systems_hdf = os.environ.get("PV_DATA_HDF")
+
+systems_with_data = [
+    "56151",
+    "56709",
+    "58780",
+    "59687",
+    "59710",
+    "60294",
+    "60602",
+    "60673",
+    "66634",
+    "67861",
+    "71120",
+    "72742",
+    "73347",
+    "77684",
+    "77710",
+    "78186",
+    "79612",
+    "81408",
+    "82081",
+    "85738",
+    "86244",
+    "87410",
+    "90559",
+    "91554",
+    "97094",
+    "99833",
+]
+
+
+pv_systems = []
+
+# build one frame per system holding the daily maximum of every column
+with h5py.File(pv_systems_hdf, "r") as f:
+    for system_id in systems_with_data:
+        df_pv_system = pd.DataFrame(np.array(f["timeseries"][system_id]["table"]))
+        df_pv_system["index"] = pd.to_datetime(df_pv_system["index"], unit="ns")
+        df_pv_system = df_pv_system.groupby(pd.Grouper(key="index", freq="D")).max()
+        df_pv_system["System ID"] = system_id
+        df_pv_system = pd.DataFrame(df_pv_system, columns=["cumulative_energy_gen_Wh", "System ID"])
+        df_pv_system["cumulative_energy_gen_kWh"] = (
+            df_pv_system["cumulative_energy_gen_Wh"] / 1000
+        ).astype(float)
+        pv_systems.append(df_pv_system)
+
+
+fig = go.Figure()
+for pv_system in pv_systems:
+    fig.add_trace(
+        go.Box(
+            y=pv_system["cumulative_energy_gen_kWh"],
+            x=pv_system["System ID"],
+            # the frame is indexed by date, so take the first value by position
+            name=pv_system["System ID"].iloc[0],
+            boxpoints="suspectedoutliers",
+            jitter=0.5,
+            whiskerwidth=0.2,
+            fillcolor="rgba(93, 164, 214, 0.5)",
+            marker_size=2,
+            line_width=1,
+        )
+    )
+
+# the layout belongs to the whole figure, so it is set once after all traces are added
+fig.update_layout(
+    title="Daily max values of cumulative energy generation per system",
+    yaxis=dict(
+        autorange=True,
+        showgrid=True,
+        zeroline=True,
+        gridcolor="rgb(255, 255, 255)",
+        gridwidth=1,
+        zerolinecolor="rgb(255, 255, 255)",
+        zerolinewidth=2,
+    ),
+)
+
+fig.show()
diff --git a/examples/india_chart_per_system.py b/examples/india_chart_per_system.py
new file mode 100644
index 0000000..7dc8667
--- /dev/null
+++ b/examples/india_chart_per_system.py
@@ -0,0 +1,63 @@
+"""Chart generation for a single system. This shows the cumulative energy
+generation or instantaneous power generation over time. Some of the commented out
+code can be used to filter the data by date or plot max, mean, min, median values, etc.
+"""
+import os
+
+import h5py
+import numpy as np
+import pandas as pd
+import plotly.express as px
+
+# load hdf file with the generation data for each system
+pv_systems_hdf = os.environ.get("PV_DATA_HDF")
+
+# this plots one system as a line chart in plotly
+# plot the data as line graph in plotly
+
+# can choose a system id to input here
+system_id = "SYSTEM_ID"
+# read the hdf file and get the data for the system id
+with h5py.File(pv_systems_hdf, "r") as f:
+    pv_system_data = pd.DataFrame(np.array(f["timeseries"][system_id]["table"]))
+    pv_system_data["index"] = pd.to_datetime(pv_system_data["index"], unit="ns")
+    pv_system_data = pv_system_data.set_index("index", inplace=False)
+    # this code can be used to get the mean weekly ("W") or monthly ("M")
+    # production per system or the max daily production ("D")
+    # pv_system_data = pv_system_data.groupby(pd.Grouper(freq="M")).mean()
+
+    pv_system_data["System ID"] = system_id
+    # the timestamps are the index of the frame now, so only data columns are selected
+    pv_system_data = pd.DataFrame(
+        pv_system_data,
+        columns=[
+            "cumulative_energy_gen_Wh",
+            "System ID",
+            "instantaneous_power_gen_W",
+        ],
+    )
+    # filter the data by date
+    # pv_system_data = pv_system_data.loc[
+    #     pv_system_data.index > pd.Timestamp("2019-05-01 00:00:00")
+    # ]
+    # pv_system_data = pv_system_data.loc[
+    #     pv_system_data.index < pd.Timestamp("2019-07-01 00:00:00")
+    # ]
+
+    pv_system_data["cumulative_energy_gen_kWh"] = (
+        pv_system_data["cumulative_energy_gen_Wh"] / 1000
+    ).astype(float)
+
+    # plot the data as a line chart
+    fig = px.line(
+        pv_system_data,
+        x=pv_system_data.index,
+        # here you can swap between cumulative energy generation and instantaneous
+        # power generation on the y axis
+        y=pv_system_data["cumulative_energy_gen_kWh"],
+        title=f"Generation for System ID: {system_id}",
+    )
+    fig.update_layout(
+        xaxis_title="Time",
+    )
+    fig.show()
diff --git a/examples/india_gantt_chart.py b/examples/india_gantt_chart.py
new file mode 100644
index 0000000..c1654b7
--- /dev/null
+++ b/examples/india_gantt_chart.py
@@ -0,0 +1,88 @@
+"""Gantt chart for India PV systems. This shows where there are gaps in the data."""
+import os
+
+import h5py
+import numpy as np
+import pandas as pd
+import plotly.express as px
+
+# load hdf file with the generation data for each system
+pv_data_hdf = os.environ.get("PV_DATA_HDF")
+
+# these are the current systems with some data in the hdf file for india
+systems_with_data = [
+    "56151",
+    "56709",
+    "58780",
+    "59687",
+    "59710",
+    "60294",
+    "60602",
+    "66634",
+    "71120",
+    "72742",
+    "73347",
+    "77684",
+    "77710",
+    "78186",
+    "79612",
+    "81408",
+    "82081",
+    "85738",
+    "86244",
+    "87410",
+    "90559",
+    "91554",
+    "97094",
+    "99833",
+    "100451",
+]
+
+
+pv_systems = []
+
+# read the hdf file and get start and end dates of available data per site
+with h5py.File(pv_data_hdf, "r") as f:
+    # loop through each pv system in the hdf file
+    for system_id in systems_with_data:
+        df = pd.DataFrame(np.array(f["timeseries"][system_id]["table"]))
+        df["index"] = pd.to_datetime(df["index"], unit="ns")
+        df = df[df["index"] > pd.Timestamp("2018-01-01")].copy()
+        # nothing to plot for a system without data after the cut-off date
+        if df.empty:
+            continue
+        # remember the last timestamp, it closes the final bar of this system
+        end_date = df["index"].iloc[-1]
+        df["index_difference"] = df["index"].diff()
+        # keep the rows that follow a gap of more than a day, each one starts a run of data
+        # NOTE: the run of data before the first gap is not part of the chart
+        # copy so that the new columns are not written to a slice of the previous frame
+        df = df[df["index_difference"] > pd.Timedelta("1D")].copy()
+        # timestamp just before each gap, i.e. the end of the previous run of data
+        df["previous_endpoint"] = df["index"] - df["index_difference"]
+        # each run ends where the next gap begins
+        df["endpoints"] = df["previous_endpoint"].shift(-1)
+        # set the last endpoint to the end date otherwise it registers as NaT
+        if not df.empty:
+            df.iloc[-1, df.columns.get_loc("endpoints")] = end_date
+
+        # one dictionary per bar of the gantt chart
+        for _, row in df.iterrows():
+            pv_systems.append(
+                {
+                    "System ID": system_id,
+                    "Start": row["index"],
+                    "Finish": row["endpoints"],
+                }
+            )
+
+# plot the data as gantt chart in plotly
+fig = px.timeline(
+    pv_systems,
+    x_start="Start",
+    x_end="Finish",
+    y="System ID",
+    color="System ID",
+    title="Gantt Chart of PV Systems in India",
+)
+fig.show()
diff --git a/examples/india_map.py b/examples/india_map.py
new file mode 100644
index 0000000..c849402
--- /dev/null
+++ b/examples/india_map.py
@@ -0,0 +1,27 @@
+""" Example of plotting PVOutput India system locations on a map."""
+import pandas as pd
+import plotly.express as px
+
+# load csv file with system metadata
+pv_system_metadata = "./examples/pv_data/PVOutput_India_systems.csv"
+# pv_system_metadata = os.environ.get("PV_SYSTEM_FILE")
+pv_system_metadata = pd.read_csv(pv_system_metadata)
+pv_systems_lat_lon = pd.DataFrame(
+    pv_system_metadata, columns=["system_id", "latitude", "longitude", "system_size_W"]
+)
+# remove systems that don't have a lat/lon coordinate
+if pv_systems_lat_lon["latitude"].isnull().values.any():
+    pv_systems_lat_lon = pv_systems_lat_lon.dropna()
+
+
+fig = px.scatter_geo(
+    pv_systems_lat_lon,
+    lat="latitude",
+    lon="longitude",
+    size="system_size_W",
+    color="system_size_W",
+    hover_name="system_id",
+    scope="asia",
+    title="PVOutput India System Locations",
+)
+fig.show()
diff --git a/examples/india_mean_production.py b/examples/india_mean_production.py
new file mode 100644
index 0000000..0b461ca
--- /dev/null
+++ b/examples/india_mean_production.py
@@ -0,0 +1,98 @@
+"""Mean Production per System by time interval (month, week, etc.)
+This example shows the mean daily production per system by month.
+This makes a plot with 2 columns and half as many rows as there are systems with data.
+"""
+import os
+
+import h5py
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+
+# load hdf file with the generation data for each system
+pv_data_hdf_file = os.environ.get("SYSTEM_DATA")
+
+# these are the current systems with data in the hdf file for India
+systems_with_data = [
+    "56151",
+    "56709",
+    "58780",
+    "59687",
+    "59710",
+    "60294",
+    "60602",
+    "60673",
+    "66634",
+    "67861",
+    "71120",
+    "72742",
+    "73347",
+    "77684",
+    "77710",
+    "78186",
+    "79612",
+    "81408",
+    "82081",
+    "85738",
+    "86244",
+    "87410",
+    "90559",
+    "91554",
+    "97094",
+    "99833",
+]
+
+
+def row(plot_number):
+    """Return the 1-based subplot row of the 1-based plot number in a two-column grid."""
+    return (plot_number + 1) // 2
+
+
+pv_systems = []
+# read the hdf file
+with h5py.File(pv_data_hdf_file, "r") as f:
+    # loop through each pv system in the hdf file. some of the lines are commented out but can
+    # be used to filter the data by date or to get the mean weekly production per system
+    for system_id in systems_with_data:
+        df = pd.DataFrame(np.array(f["timeseries"][system_id]["table"]))
+        df["index"] = pd.to_datetime(df["index"], unit="ns")
+        # df = df[df["index"] > pd.Timestamp("2019-01-01")]
+        df_pv_system = df.groupby(pd.Grouper(key="index", freq="M")).mean()
+        df_pv_system["System ID"] = system_id
+        df_pv_system = pd.DataFrame(df_pv_system, columns=["cumulative_energy_gen_Wh", "System ID"])
+        # convert Wh to kWh
+        df_pv_system["cumulative_energy_gen_kWh"] = (
+            df_pv_system["cumulative_energy_gen_Wh"] / 1000
+        ).astype(float)
+        pv_systems.append(df_pv_system)
+
+# make the plot with subplots, two per row, so the row of the last plot is the row count
+fig = make_subplots(
+    rows=row(len(pv_systems)),
+    cols=2,
+    shared_xaxes=False,
+    horizontal_spacing=0.2,
+    vertical_spacing=0.02,
+    subplot_titles=systems_with_data,
+)
+# loop through each system and add a line to the subplot
+for i, df_pv_system in enumerate(pv_systems, start=1):
+    if len(df_pv_system) > 0:
+        fig.add_trace(
+            go.Scatter(
+                x=df_pv_system.index,
+                # plot the kWh column so the values match the y axis title
+                y=df_pv_system["cumulative_energy_gen_kWh"],
+                # the frame is indexed by date, so take the first value by position
+                name=df_pv_system["System ID"].iloc[0],
+                mode="lines",
+            ),
+            row=row(i),
+            # add_trace needs a plain integer here, odd plots go left and even plots right
+            col=2 if i % 2 == 0 else 1,
+        )
+fig.update_yaxes(title_text="kWh")
+fig.update_layout(height=3000, width=750, title_text="Mean Monthly Production per System")
+fig.update_annotations(font_size=12)
+fig.show()
diff --git a/examples/india_system_capacity_histogram.py b/examples/india_system_capacity_histogram.py
new file mode 100644
index 0000000..fda4c2e
--- /dev/null
+++ b/examples/india_system_capacity_histogram.py
@@ -0,0 +1,27 @@
+"""Histogram of system capacities for India."""
+import os
+
+import pandas as pd
+import plotly.express as px
+
+# load csv file with system metadata
+pv_systems_metadata = os.environ.get("PV_SYSTEM_METADATA")
+
+# read the csv file and build a dataframe
+data = pd.read_csv(pv_systems_metadata) +pv_metadata = pd.DataFrame(data, columns=["system_id", "system_size_W"]) +pv_metadata["system_id"] = pv_metadata["system_id"].astype(str) +pv_metadata["system_size_W"] = (pv_metadata["system_size_W"] / 1000).astype(float) +pv_metadata.rename( + columns={"system_id": "System ID", "system_size_W": "System Capacity (kW)"}, + inplace=True, +) + +# plot the data as histogram in plotly +fig = px.histogram( + pv_metadata, + x="System ID", + y="System Capacity (kW)", + title="PVOutput India System Capacities", +) +fig.show() diff --git a/pvoutput/mapscraper.py b/pvoutput/mapscraper.py index c64f1f1..9c0c8d1 100644 --- a/pvoutput/mapscraper.py +++ b/pvoutput/mapscraper.py @@ -350,6 +350,7 @@ def clean_soup(soup): """Function to clean scraped soup object. Note that the downloaded soup could change over time. + Args: soup: bs4.BeautifulSoup diff --git a/pvoutput/pvoutput.py b/pvoutput/pvoutput.py index 0e4ea35..6993985 100644 --- a/pvoutput/pvoutput.py +++ b/pvoutput/pvoutput.py @@ -924,7 +924,7 @@ def _download_multiple_worker( ) else: total_rows += len(timeseries) - _LOG.info(f'Adding timezone {timezone} to {total_rows} rows') + _LOG.info(f"Adding timezone {timezone} to {total_rows} rows") timeseries = timeseries.tz_localize(timezone) _LOG.info( "system_id: %d: %d rows retrieved: %s to %s", diff --git a/scripts/fetch_pv_timeseries.py b/scripts/fetch_pv_timeseries.py index ac7d4a6..39b6eda 100644 --- a/scripts/fetch_pv_timeseries.py +++ b/scripts/fetch_pv_timeseries.py @@ -21,14 +21,15 @@ or create and use a ~/.pvoutput.yml file as described in the PVOutput library documentation """ -from pvoutput import * - -import click as cl import datetime as dt +import logging +import pathlib import sys + +import click as cl import pandas as pd -import pathlib -import logging + +from pvoutput import * @cl.command()