main.py
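# Driver script for the linear-regression exploration: it sets up the GoodData CN
# environment, generates (or loads cached) Triplet combinations for the workspace,
# runs the linear-regression assumption checks for each one, and sorts the results
# (see run() and main() below).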
from __future__ import annotations

import pickle
from pathlib import Path

from dotenv import load_dotenv
from tqdm import tqdm

from config import Config
from generator import generate_combinations, Triplet
from linear_regression import MyLinearRegression

load_dotenv()

sdk = Config.sdk
pandas = Config.pandas


def set_up() -> None:
    """Load the declarative data sources and workspaces into GoodData CN."""
    sdk.catalog_data_source.load_and_put_declarative_data_sources(
        credentials_path=Path("credentials.yaml"), test_data_sources=True
    )
    sdk.catalog_workspace.load_and_put_declarative_workspaces()


def cache_combinations(workspace_id: str = Config.workspace_id, name: str = "combinations.pickle") -> None:
    """Generate all combinations for the workspace and cache them into a pickle file."""
    combinations = generate_combinations(sdk, workspace_id)
    with open(name, "wb") as f:
        pickle.dump(combinations, f)


def load_combinations(name: str = "combinations.pickle") -> set[Triplet]:
    """Load previously cached combinations from a pickle file."""
    with open(name, "rb") as f:
        return pickle.load(f)
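

# A possible caching workflow (a sketch that uses only the helpers above, with their
# default workspace id and file name):
#     cache_combinations()                # generate once and persist to combinations.pickle
#     combinations = load_combinations()  # cheap reloads on subsequent runs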


def run(generate: bool = False, workspace_id: str = Config.workspace_id) -> list[MyLinearRegression]:
    """
    :param generate: Flag to trigger generation. If False, cached combinations are used instead.
    :param workspace_id: The ID of the workspace we want to explore.
    :return: Sorted list of MyLinearRegression objects.
        Assumptions of these objects can easily be re-run with different values.
    """
    pandas_df = pandas.data_frames(workspace_id)
    if generate:
        combinations = generate_combinations(sdk, workspace_id)
    else:
        combinations = load_combinations("combinations.pickle")

    # Create the objects that perform linear regression.
    regressions = []
    print("Creating linear regression objects...")
    for combination in tqdm(combinations):
        regression = MyLinearRegression(pandas_df, combination)
        regressions.append(regression)

    # Trigger the assumptions check.
    print("Running linear regression assumptions check...")
    for regression in tqdm(regressions):
        regression.check_assumptions()

    # Sort the results from best to worst; the sort key can be customized,
    # e.g. to the sum of absolute differences from the thresholds.
    regressions = [r for r in regressions if r.assumptions_results]
    regressions.sort(key=lambda x: x.valid_assumptions_count, reverse=True)
    return regressions


def main():
    # This sets up the whole GoodData CN environment for you.
    set_up()
    # This generates (or loads from cache) all combinations and performs linear regression on
    # them. The output is a sorted list of linear regressions in which the first object is the
    # best pick (it passes the most assumptions) and the last object is the worst pick.
    linear_regressions = run()
    # Have a look at the best linear regression.
    linear_regressions[0].visualize()
    # Have a look at the worst linear regression.
    linear_regressions[-1].visualize()
    # You are more than welcome to further examine these objects and check their assumptions.


if __name__ == "__main__":
    main()
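
# Example interactive exploration (a sketch; it only uses names defined or referenced in
# this file, such as run(), check_assumptions(), and visualize()):
#     regressions = run(generate=True)    # regenerate the combinations instead of using the cache
#     best, worst = regressions[0], regressions[-1]
#     best.check_assumptions()            # re-run the assumption checks
#     best.visualize()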