From 0474aff9fff6c80bc6e75b51b739dc2e3b201962 Mon Sep 17 00:00:00 2001 From: NevilParikh14 Date: Mon, 3 Oct 2022 15:29:31 +0530 Subject: [PATCH 01/22] Initial commit --- .circleci/config.yml | 14 +- tap_mixpanel/__init__.py | 75 +-- tap_mixpanel/client.py | 302 ++++++----- tap_mixpanel/discover.py | 28 +- tap_mixpanel/schema.py | 149 +++--- tap_mixpanel/streams.py | 454 ++++++++++++----- tap_mixpanel/sync.py | 26 +- tap_mixpanel/transform.py | 117 ++++- tests/configuration/fixtures.py | 8 +- tests/tap_tester/base.py | 244 +++++---- .../tap_tester/test_all_fields_pagination.py | 163 ------ tests/tap_tester/test_discovery.py | 142 ------ tests/tap_tester/test_mixpanel_all_fields.py | 125 +++++ ...s.py => test_mixpanel_automatic_fields.py} | 35 +- ..._bookmark.py => test_mixpanel_bookmark.py} | 100 ++-- tests/tap_tester/test_mixpanel_discovery.py | 190 +++++++ tests/tap_tester/test_mixpanel_pagination.py | 107 ++++ ...rt_date.py => test_mixpanel_start_date.py} | 148 +++--- tests/unittests/test_error_handling.py | 472 ++++++------------ tests/unittests/test_medium_client.py | 178 ++++--- .../test_request_timeout_param_value.py | 162 +++--- tests/unittests/test_support_eu_endpoints.py | 228 ++++++--- tests/unittests/test_transform_event_times.py | 44 +- 23 files changed, 2040 insertions(+), 1471 deletions(-) delete mode 100644 tests/tap_tester/test_all_fields_pagination.py delete mode 100644 tests/tap_tester/test_discovery.py create mode 100644 tests/tap_tester/test_mixpanel_all_fields.py rename tests/tap_tester/{test_automatic_fields.py => test_mixpanel_automatic_fields.py} (67%) rename tests/tap_tester/{test_bookmark.py => test_mixpanel_bookmark.py} (69%) create mode 100644 tests/tap_tester/test_mixpanel_discovery.py create mode 100644 tests/tap_tester/test_mixpanel_pagination.py rename tests/tap_tester/{test_start_date.py => test_mixpanel_start_date.py} (50%) diff --git a/.circleci/config.yml b/.circleci/config.yml index 0865a7d..d60c08a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -15,12 +15,11 @@ jobs: pip install -U 'pip<19.2' 'setuptools<51.0.0' pip install .[dev] pip install pytest-cov - # TODO: Fails pylint a lot, skipping for now (https://stitchdata.atlassian.net/browse/SRCE-4606) - #- run: - # name: 'pylint tap' - # command: | - # source /usr/local/share/virtualenvs/tap-mixpanel/bin/activate - # pylint tap_mixpanel -d 'broad-except,chained-comparison,empty-docstring,fixme,invalid-name,line-too-long,missing-class-docstring,missing-function-docstring,missing-module-docstring,no-else-raise,no-else-return,too-few-public-methods,too-many-arguments,too-many-branches,too-many-lines,too-many-locals,ungrouped-imports,wrong-spelling-in-comment,wrong-spelling-in-docstring,too-many-public-methods' + - run: + name: 'pylint tap' + command: | + source /usr/local/share/virtualenvs/tap-mixpanel/bin/activate + pylint tap_mixpanel -d 'broad-except,chained-comparison,empty-docstring,fixme,invalid-name,line-too-long,missing-module-docstring,no-else-raise,no-else-return,too-few-public-methods,too-many-arguments,too-many-branches,too-many-lines,too-many-locals,ungrouped-imports,too-many-public-methods,protected-access,too-many-statements,not-an-iterable' - run: name: 'JSON Validator' command: | @@ -30,7 +29,8 @@ jobs: name: 'Unit Tests' command: | source /usr/local/share/virtualenvs/tap-mixpanel/bin/activate - python -m pytest --junitxml=junit/test-result.xml --cov=tap_mixpanel --cov-report=html tests/unittests/ + pip install coverage parameterized + python -m pytest 
--junitxml=junit/test-result.xml --cov=tap_mixpanel --cov-report=html tests/unittests/ - store_test_results: path: test_output/report.xml - store_artifacts: diff --git a/tap_mixpanel/__init__.py b/tap_mixpanel/__init__.py index 802d95a..00c5e40 100644 --- a/tap_mixpanel/__init__.py +++ b/tap_mixpanel/__init__.py @@ -1,12 +1,13 @@ #!/usr/bin/env python3 -import sys import json -import argparse -from datetime import datetime, timedelta, date +import sys +from datetime import timedelta + import singer -from singer import metadata, utils -from singer.utils import strptime_to_utc, strftime +from singer import utils +from singer.utils import strftime, strptime_to_utc + from tap_mixpanel.client import MixpanelClient from tap_mixpanel.discover import discover from tap_mixpanel.sync import sync @@ -15,29 +16,39 @@ REQUEST_TIMEOUT = 300 REQUIRED_CONFIG_KEYS = [ - 'project_timezone', - 'api_secret', - 'attribution_window', - 'start_date', - 'user_agent' + "project_timezone", + "api_secret", + "attribution_window", + "start_date", + "user_agent", ] def do_discover(client, properties_flag): - LOGGER.info('Starting discover') + """Call the discovery function. + + Args: + client (MixpanelClient): Client object to make http calls. + properties_flag (str): Setting this argument to `true` ensures that new properties on + events and engage records are captured. + """ + LOGGER.info("Starting discover") catalog = discover(client, properties_flag) json.dump(catalog.to_dict(), sys.stdout, indent=2) - LOGGER.info('Finished discover') + LOGGER.info("Finished discover") @singer.utils.handle_top_exception(LOGGER) def main(): + """ + Run discover mode or sync mode. + """ parsed_args = singer.utils.parse_args(REQUIRED_CONFIG_KEYS) - start_date = parsed_args.config['start_date'] + start_date = parsed_args.config["start_date"] # Set request timeout to config param `request_timeout` value. # If value is 0, "0", "" or not passed then it sets default to 300 seconds. 
- config_request_timeout = parsed_args.config.get('request_timeout') + config_request_timeout = parsed_args.config.get("request_timeout") if config_request_timeout and float(config_request_timeout): request_timeout = float(config_request_timeout) else: @@ -45,43 +56,47 @@ def main(): start_dttm = strptime_to_utc(start_date) now_dttm = utils.now() - if parsed_args.config.get('end_date'): - now_dttm = strptime_to_utc(parsed_args.config.get('end_date')) + if parsed_args.config.get("end_date"): + now_dttm = strptime_to_utc(parsed_args.config.get("end_date")) delta_days = (now_dttm - start_dttm).days if delta_days >= 365: delta_days = 365 start_date = strftime(now_dttm - timedelta(days=delta_days)) - LOGGER.warning("start_date greater than 1 year maxiumum for API.") + LOGGER.warning("start_date greater than 1 year maximum for API.") LOGGER.warning("Setting start_date to 1 year ago, %s", start_date) - #Check support for EU endpoints - if str(parsed_args.config.get('eu_residency')).lower() == "true": + # Check support for EU endpoints + if str(parsed_args.config.get("eu_residency")).lower() == "true": api_domain = "eu.mixpanel.com" else: api_domain = "mixpanel.com" - with MixpanelClient(parsed_args.config['api_secret'], - api_domain, - request_timeout, - parsed_args.config['user_agent']) as client: + with MixpanelClient( + parsed_args.config["api_secret"], + api_domain, + request_timeout, + parsed_args.config["user_agent"], + ) as client: state = {} if parsed_args.state: state = parsed_args.state config = parsed_args.config - properties_flag = config.get('select_properties_by_default') + properties_flag = config.get("select_properties_by_default") if parsed_args.discover: client.__api_domain = api_domain do_discover(client, properties_flag) elif parsed_args.catalog: - sync(client=client, - config=config, - catalog=parsed_args.catalog, - state=state, - start_date=start_date) + sync( + client=client, + config=config, + catalog=parsed_args.catalog, + state=state, + start_date=start_date, + ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tap_mixpanel/client.py b/tap_mixpanel/client.py index 7f3d254..5bc97dc 100644 --- a/tap_mixpanel/client.py +++ b/tap_mixpanel/client.py @@ -1,11 +1,10 @@ import base64 -import io import backoff import jsonlines import requests import singer -from requests.exceptions import ConnectionError, Timeout +from requests.exceptions import Timeout from singer import metrics LOGGER = singer.get_logger() @@ -15,112 +14,127 @@ class ReadTimeoutError(Exception): - pass + """Custom error for request timeout.""" class Server5xxError(Exception): - pass + """Custom error class for all the 5xx error.""" class Server429Error(Exception): - pass + """Custom error class for rate limit exceeded.""" class MixpanelError(Exception): - pass + """Custom error class for all the Mixpanel errors.""" class MixpanelBadRequestError(MixpanelError): - pass + """Custom error class for bad request.""" class MixpanelUnauthorizedError(MixpanelError): - pass + """Custom error class for authorization.""" class MixpanelPaymentRequiredError(MixpanelError): - pass + """Custom error if API call require payment.""" class MixpanelNotFoundError(MixpanelError): - pass + """Custom error class for not found error.""" class MixpanelForbiddenError(MixpanelError): - pass + """Custom error class for forbidden error.""" class MixpanelInternalServiceError(Server5xxError): - pass + """Custom error class for internal server error.""" +# Custom errors with respective messages mapped by error code. 
ERROR_CODE_EXCEPTION_MAPPING = { 400: { "raise_exception": MixpanelBadRequestError, - "message": "A validation exception has occurred." + "message": "A validation exception has occurred.", }, 401: { "raise_exception": MixpanelUnauthorizedError, - "message": "Invalid authorization credentials." + "message": "Invalid authorization credentials.", }, 402: { "raise_exception": MixpanelPaymentRequiredError, - "message": "Your current plan does not allow API calls. Payment is required to complete the operation." + "message": "Your current plan does not allow API calls. Payment is required to complete the operation.", }, 403: { "raise_exception": MixpanelForbiddenError, - "message": "User does not have permission to access the resource." + "message": "User does not have permission to access the resource.", }, 404: { "raise_exception": MixpanelNotFoundError, - "message": "The resource you have specified cannot be found." + "message": "The resource you have specified cannot be found.", }, 429: { "raise_exception": Server429Error, - "message": "The API rate limit for your organisation/application pairing has been exceeded." + "message": "The API rate limit for your organization/application pairing has been exceeded.", }, 500: { "raise_exception": MixpanelInternalServiceError, - "message": "Server encountered an unexpected condition that prevented it from fulfilling the request." - } + "message": "Server encountered an unexpected condition that prevented it from fulfilling the request.", + }, } + def raise_for_error(response): - LOGGER.error('ERROR %s: %s, REASON: %s', response.status_code, - response.text, - response.reason) + """Retrieve the error code and the error message from the response + and raises custom exceptions accordingly. + + Args: + response (requests.Response): Response with error code. + + Raises: + exc: Custom exception prepared according to status code. + """ + LOGGER.error( + "ERROR %s: %s, REASON: %s", response.status_code, response.text, response.reason + ) try: response_json = response.json() except Exception: response_json = {} error_code = response.status_code error_message = response_json.get( - "error", response_json.get( - "message", ERROR_CODE_EXCEPTION_MAPPING.get( - error_code, {}).get( - "message", "Unknown Error"))) - - # if response text contains something unusual error of to_date then provide helper message of timezone mismatch + "error", + response_json.get( + "message", + ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get( + "message", "Unknown Error" + ), + ), + ) + + # If response text contains something unusual error of to_date then provide helper message of timezone mismatch # E.g error: to_date cannot be later than today if error_code == 400: if "to_date" in response.text: error_message += " Please validate the timezone with the MixPanel UI under project settings." 
else: - error_message = '{}(Please verify your credentials.)'.format(error_message) + error_message = f"{error_message}(Please verify your credentials.)" - message = "HTTP-error-code: {}, Error: {}".format(error_code, error_message) + message = f"HTTP-error-code: {error_code}, Error: {error_message}" - exc = ERROR_CODE_EXCEPTION_MAPPING.get( - error_code, {}).get("raise_exception", MixpanelError) + exc = ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get( + "raise_exception", MixpanelError + ) raise exc(message) from None -class MixpanelClient(object): - def __init__(self, - api_secret, - api_domain, - request_timeout, - user_agent=None): +class MixpanelClient: + """ + The client class used for making REST calls to the Mixpanel API. + """ + def __init__(self, api_secret, api_domain, request_timeout, user_agent=None): self.__api_secret = api_secret self.__api_domain = api_domain self.__request_timeout = request_timeout @@ -136,63 +150,95 @@ def __enter__(self): def __exit__(self, exception_type, exception_value, traceback): self.__session.close() - @backoff.on_exception(backoff.expo, - (Server5xxError, Server429Error, ReadTimeoutError, ConnectionError, Timeout), - max_tries=5, - factor=2) + @backoff.on_exception( + backoff.expo, + (Server5xxError, Server429Error, ReadTimeoutError, ConnectionError, Timeout), + max_tries=5, + factor=2, + ) def check_access(self): + """Call rest API to verify user's credentials. + + Raises: + Exception: Raises if response is not success. + ReadTimeoutError: Raises if requests timeout. + + Returns: + bool: Returns true if credentials are verified. + (else raises Exception) + """ if self.__api_secret is None: - raise Exception('Error: Missing api_secret in tap config.json.') + raise Exception("Error: Missing api_secret in tap config.json.") headers = {} # Endpoint: simple API call to return a single record (org settings) to test access - url = 'https://{}/api/2.0/engage'.format(self.__api_domain) + url = f"https://{self.__api_domain}/api/2.0/engage" if self.__user_agent: - headers['User-Agent'] = self.__user_agent - headers['Accept'] = 'application/json' - headers['Authorization'] = 'Basic {}'.format( - str(base64.urlsafe_b64encode(self.__api_secret.encode("utf-8")), "utf-8")) + headers["User-Agent"] = self.__user_agent + headers["Accept"] = "application/json" + headers[ + "Authorization" + ] = f"Basic {str(base64.urlsafe_b64encode(self.__api_secret.encode('utf-8')), 'utf-8')}" try: response = self.__session.get( url=url, - timeout=self.__request_timeout, # Request timeout parameter - headers=headers) + timeout=self.__request_timeout, # Request timeout parameter + headers=headers, + ) except requests.exceptions.Timeout as err: - LOGGER.error('TIMEOUT ERROR: %s',str(err)) - raise ReadTimeoutError + LOGGER.error("TIMEOUT ERROR: %s", str(err)) + raise ReadTimeoutError from None if response.status_code == 402: # 402 Payment Requirement does not indicate a permissions or authentication error self.disable_engage_endpoint = True - LOGGER.warning('Mixpanel returned a 402 from the Engage API. Engage stream will be skipped.') + LOGGER.warning( + "Mixpanel returned a 402 from the Engage API. Engage stream will be skipped." 
+ ) return True elif response.status_code != 200: - LOGGER.error('Error status_code = {}'.format(response.status_code)) + LOGGER.error("Error status_code = %s", response.status_code) raise_for_error(response) - else: - return True + return True @backoff.on_exception( backoff.expo, (Server5xxError, Server429Error, ReadTimeoutError, ConnectionError, Timeout), max_tries=BACKOFF_MAX_TRIES_REQUEST, factor=3, - logger=LOGGER) - def perform_request(self, - method, - url=None, - params=None, - json=None, - stream=False, - **kwargs): + logger=LOGGER, + ) + def perform_request( + self, method, url=None, params=None, json=None, stream=False, **kwargs + ): + """Call rest API and return the response in case of status code 200. + + Args: + method (str): GET or POST method. + url (str, optional): Complete url for the stream. Defaults to None. + params (dict, optional): Query params. Defaults to None. + json (dict, optional): JSON data (For POST request). Defaults to None. + stream (bool, optional): If False, a response transfers indicating that + the file should download immediately. If True, stream the file. + Defaults to False. + + Raises: + Server5xxError: Raises if status code > 500 + ReadTimeoutError: Raises if request timeouts. + + Returns: + dict: With status code 200, returns JSON formatted response. + """ try: - response = self.__session.request(method=method, - url=url, - params=params, - json=json, - stream=stream, - timeout=self.__request_timeout, # Request timeout parameter - **kwargs) + response = self.__session.request( + method=method, + url=url, + params=params, + json=json, + stream=stream, + timeout=self.__request_timeout, # Request timeout parameter + **kwargs, + ) if response.status_code > 500: raise Server5xxError() @@ -201,89 +247,111 @@ def perform_request(self, raise_for_error(response) return response except requests.exceptions.Timeout as err: - LOGGER.error('TIMEOUT ERROR: %s',str(err)) - raise ReadTimeoutError(err) + LOGGER.error("TIMEOUT ERROR: %s", str(err)) + raise ReadTimeoutError(err) from None def request(self, method, url=None, path=None, params=None, json=None, **kwargs): + """Request method to return JSON response of HTTP call. + + Args: + method (str): GET or POST method. + url (str, optional): Base URL. Defaults to None. + path (str, optional): Path for the stream. Defaults to None. + params (dict, optional): Query params. Defaults to None. + json (dict, optional): JSON data (For POST requests). Defaults to None. + + Returns: + dict: JSON object of response. 
+ """ if not self.__verified: self.__verified = self.check_access() if url and path: - url = '{}/{}'.format(url, path) + url = f"{url}/{path}" elif path and not url: - url = 'https://{}/api/2.0/{}'.format(self.__api_domain, path) + url = f"https://{self.__api_domain}/api/2.0/{path}" - if 'endpoint' in kwargs: - endpoint = kwargs['endpoint'] - del kwargs['endpoint'] + if "endpoint" in kwargs: + endpoint = kwargs["endpoint"] + del kwargs["endpoint"] else: endpoint = None - if 'headers' not in kwargs: - kwargs['headers'] = {} + if "headers" not in kwargs: + kwargs["headers"] = {} - kwargs['headers']['Accept'] = 'application/json' + kwargs["headers"]["Accept"] = "application/json" if self.__user_agent: - kwargs['headers']['User-Agent'] = self.__user_agent + kwargs["headers"]["User-Agent"] = self.__user_agent - if method == 'POST': - kwargs['headers']['Content-Type'] = 'application/json' + if method == "POST": + kwargs["headers"]["Content-Type"] = "application/json" - kwargs['headers']['Authorization'] = 'Basic {}'.format( - str(base64.urlsafe_b64encode(self.__api_secret.encode("utf-8")), "utf-8")) + kwargs["headers"][ + "Authorization" + ] = f"Basic {str(base64.urlsafe_b64encode(self.__api_secret.encode('utf-8')), 'utf-8')}" with metrics.http_request_timer(endpoint) as timer: - response = self.perform_request(method=method, - url=url, - params=params, - json=json, - **kwargs) + response = self.perform_request( + method=method, url=url, params=params, json=json, **kwargs + ) timer.tags[metrics.Tag.http_status_code] = response.status_code response_json = response.json() return response_json - def request_export(self, method, url=None, path=None, params=None, json=None, **kwargs): + def request_export( + self, method, url=None, path=None, params=None, json=None, **kwargs + ): + """Method to read jsonline from export stream response. + + Args: + method (str): HTTP request method. + url (str, optional): Base URL for the export endpoint. Defaults to None. + path (str, optional): Path to the stream(export). Defaults to None. + params (dict, optional): Request calls params. Defaults to None. + json (dict, optional): JSON data (For POST request). Defaults to None. + + Yields: + dict: Records of export stream. 
+ """ if not self.__verified: self.__verified = self.check_access() if url and path: - url = '{}/{}'.format(url, path) + url = f"{url}/{path}" elif path and not url: - url = 'https://{}/api/2.0/{}'.format(self.__api_domain, path) + url = f"https://{self.__api_domain}/api/2.0/{path}" - if 'endpoint' in kwargs: - endpoint = kwargs['endpoint'] - del kwargs['endpoint'] + if "endpoint" in kwargs: + endpoint = kwargs["endpoint"] + del kwargs["endpoint"] else: - endpoint = 'export' + endpoint = "export" - if 'headers' not in kwargs: - kwargs['headers'] = {} + if "headers" not in kwargs: + kwargs["headers"] = {} - kwargs['headers']['Accept'] = 'application/json' + kwargs["headers"]["Accept"] = "application/json" if self.__user_agent: - kwargs['headers']['User-Agent'] = self.__user_agent + kwargs["headers"]["User-Agent"] = self.__user_agent - if method == 'POST': - kwargs['headers']['Content-Type'] = 'application/json' + if method == "POST": + kwargs["headers"]["Content-Type"] = "application/json" - kwargs['headers']['Authorization'] = 'Basic {}'.format( - str(base64.urlsafe_b64encode(self.__api_secret.encode("utf-8")), "utf-8")) + kwargs["headers"][ + "Authorization" + ] = f"Basic {str(base64.urlsafe_b64encode(self.__api_secret.encode('utf-8')), 'utf-8')}" with metrics.http_request_timer(endpoint) as timer: - response = self.perform_request(method=method, - url=url, - params=params, - json=json, - stream=True, - **kwargs) + response = self.perform_request( + method=method, url=url, params=params, json=json, stream=True, **kwargs + ) timer.tags[metrics.Tag.http_status_code] = response.status_code - # export endpoint returns jsonl results; - # other endpoints return json with array of results + # 'export' endpoint returns jsonl results; + # Other endpoints return json with array of results # jsonlines reference: https://jsonlines.readthedocs.io/en/latest/ reader = jsonlines.Reader(response.iter_lines()) - for record in reader.iter(allow_none=True, skip_empty=True): - yield record + yield from reader.iter(allow_none=True, skip_empty=True) diff --git a/tap_mixpanel/discover.py b/tap_mixpanel/discover.py index dfd3526..498b988 100644 --- a/tap_mixpanel/discover.py +++ b/tap_mixpanel/discover.py @@ -1,8 +1,20 @@ from singer.catalog import Catalog, CatalogEntry, Schema + from tap_mixpanel.schema import get_schemas from tap_mixpanel.streams import STREAMS + def discover(client, properties_flag): + """Run the discovery mode, prepare the catalog file and return catalog. + + Args: + client (MixpanelClient): Client object to make http calls. + properties_flag (str): Setting this argument to `true` ensures that new properties on + events and engage records are captured. + + Returns: + singer.Catalog: Catalog object having schema and metadata of all the streams. 
+ """ schemas, field_metadata = get_schemas(client, properties_flag) catalog = Catalog([]) @@ -10,12 +22,14 @@ def discover(client, properties_flag): schema = Schema.from_dict(schema_dict) mdata = field_metadata[stream_name] - catalog.streams.append(CatalogEntry( - stream=stream_name, - tap_stream_id=stream_name, - key_properties=STREAMS[stream_name].key_properties, - schema=schema, - metadata=mdata - )) + catalog.streams.append( + CatalogEntry( + stream=stream_name, + tap_stream_id=stream_name, + key_properties=STREAMS[stream_name].key_properties, + schema=schema, + metadata=mdata, + ) + ) return catalog diff --git a/tap_mixpanel/schema.py b/tap_mixpanel/schema.py index 30a8506..8b3bcb0 100644 --- a/tap_mixpanel/schema.py +++ b/tap_mixpanel/schema.py @@ -1,22 +1,45 @@ -import os import json +import os + +import singer from singer import metadata + from tap_mixpanel.streams import STREAMS -import singer LOGGER = singer.get_logger() # Reference: # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata + def get_abs_path(path): + """Get the absolute path for the schema files. + + Args: + path (str): Path from current folder to schema file. + + Returns: + str: Full path to schema file. + """ return os.path.join(os.path.dirname(os.path.realpath(__file__)), path) def get_schema(client, properties_flag, stream_name): - schema_path = get_abs_path('schemas/{}.json'.format(stream_name)) + """Creates schema for a stream by loading schema file and appending dynamic + fields schema if necessary. + + Args: + client (MixpanelClient): Client to make http calls. + properties_flag (str): Setting this argument to `true` ensures that new properties on + events and engage records are captured. + stream_name (str): Name of stream whose schema is to create. + + Returns: + dict: Returns schema of the stream. + """ + schema_path = get_abs_path(f"schemas/{stream_name}.json") - with open(schema_path) as file: + with open(schema_path, encoding="utf-8") as file: schema = json.load(file) # Set whether to allow additional properties for engage and export endpoints @@ -24,58 +47,45 @@ def get_schema(client, properties_flag, stream_name): # when the Event or Engage (user/person) was created. # Depending on the tap config parameter select_properties_by_default, # the json schema should allow additional properties (additionalProperties = true). 
- if stream_name in ('engage', 'export') and str(properties_flag).lower() == 'true': - schema['additionalProperties'] = True + if stream_name in ("engage", "export") and str(properties_flag).lower() == "true": + schema["additionalProperties"] = True else: - schema['additionalProperties'] = False + schema["additionalProperties"] = False - if stream_name == 'engage': + if stream_name == "engage": properties = client.request( - method='GET', - url='https://{}/api/2.0'.format(client.__api_domain), - path='engage/properties', - params={'limit': 2000}, - endpoint='engage_properties') - if properties.get('status') == 'ok': - results = properties.get('results', {}) + method="GET", + url=f"https://{client.__api_domain}/api/2.0", + path="engage/properties", + params={"limit": 2000}, + endpoint="engage_properties", + ) + if properties.get("status") == "ok": + results = properties.get("results", {}) for key, val in results.items(): - if key[0:1] == '$': - new_key = 'mp_reserved_{}'.format(key[1:]) + if key[0:1] == "$": + new_key = f"mp_reserved_{key[1:]}" else: new_key = key # property_type: string, number, boolean, datetime, object, list # Reference: # https://help.mixpanel.com/hc/en-us/articles/115004547063-Properties-Supported-Data-Types - property_type = val.get('type') + property_type = val.get("type") types = { - 'boolean': { - 'type': ['null', 'boolean'] - }, - 'number': { - 'type': ['null', 'string'], - 'format': 'singer.decimal' - }, - 'datetime': { - 'type': ['null', 'string'], - 'format': 'date-time' - }, - 'object': { - 'type': ['null', 'object'], - 'additionalProperties': True - }, - 'list': { - 'type': ['null', 'array'], - 'required': False, - 'items': {} + "boolean": {"type": ["null", "boolean"]}, + "number": {"type": ["null", "string"], "format": "singer.decimal"}, + "datetime": {"type": ["null", "string"], "format": "date-time"}, + "object": { + "type": ["null", "object"], + "additionalProperties": True, }, - 'string': { - 'type': ['null', 'string'] - } + "list": {"type": ["null", "array"], "required": False, "items": {}}, + "string": {"type": ["null", "string"]}, } - if property_type in types.keys(): + if property_type in types: # Make the types a list containing all types starting with the one returned to us by the API this_type = [types.pop(property_type)] this_type += list(types.values()) @@ -83,40 +93,52 @@ def get_schema(client, properties_flag, stream_name): else: this_type = list(types.values()) + schema["properties"][new_key] = {"anyOf": this_type} - schema['properties'][new_key] = {'anyOf': this_type} - - if stream_name == 'export': + if stream_name == "export": # Event properties endpoint: # https://developer.mixpanel.com/docs/data-export-api#section-hr-span-style-font-family-courier-top-span results = client.request( - method='GET', - url='https://{}/api/2.0'.format(client.__api_domain), - path='events/properties/top', - params={'limit': 2000}, - endpoint='event_properties') + method="GET", + url=f"https://{client.__api_domain}/api/2.0", + path="events/properties/top", + params={"limit": 2000}, + endpoint="event_properties", + ) for key, val in results.items(): - if key[0:1] == '$': - new_key = 'mp_reserved_{}'.format(key[1:]) + if key[0:1] == "$": + new_key = f"mp_reserved_{key[1:]}" else: new_key = key - # string ONLY for event properties (no other datatypes) + # String ONLY for event properties (no other datatypes) # Reference: https://help.mixpanel.com/hc/en-us/articles/360001355266-Event-Properties#field-size-character-limits-for-event-properties - 
schema['properties'][new_key] = { - 'type': ['null', 'string'] - } + schema["properties"][new_key] = {"type": ["null", "string"]} return schema + def get_schemas(client, properties_flag): + """Load the schema references, prepare metadata for each streams and return + schema and metadata for the catalog. + + Args: + client (MixpanelClient): Client object to make http calls. + properties_flag (bool): Setting this argument to true ensures that new properties on + events and engage records are captured. + + Returns: + tuple: Returns tuple of Schemas and metadata. + """ schemas = {} field_metadata = {} for stream_name, stream_metadata in STREAMS.items(): # When the client detects disable_engage_endpoint, skip discovering the stream - if stream_name == 'engage' and client.disable_engage_endpoint: - LOGGER.warning('Mixpanel returned a 402 indicating the Engage endpoint and stream is unavailable. Skipping.') + if stream_name == "engage" and client.disable_engage_endpoint: + LOGGER.warning( + "Mixpanel returned a 402 indicating the Engage endpoint and stream is unavailable. Skipping." + ) continue schema = get_schema(client, properties_flag, stream_name) @@ -132,17 +154,18 @@ def get_schemas(client, properties_flag): schema=schema, key_properties=stream_metadata.key_properties, valid_replication_keys=stream_metadata.replication_keys, - replication_method=stream_metadata.replication_method + replication_method=stream_metadata.replication_method, ) mdata = metadata.to_map(mdata) if stream_metadata.replication_keys: - mdata = metadata.write( - mdata, - ('properties', stream_metadata.replication_keys[0]), - 'inclusion', - 'automatic') + mdata = metadata.write( + mdata, + ("properties", stream_metadata.replication_keys[0]), + "inclusion", + "automatic", + ) mdata = metadata.to_list(mdata) diff --git a/tap_mixpanel/streams.py b/tap_mixpanel/streams.py index c57cd60..e4cfd69 100644 --- a/tap_mixpanel/streams.py +++ b/tap_mixpanel/streams.py @@ -1,6 +1,4 @@ -""" -This module defines the stream classes and their individual sync logic. -""" +"""This module defines the stream classes and their individual sync logic.""" import json import math @@ -12,7 +10,7 @@ from singer.utils import strptime_to_utc from tap_mixpanel.client import MixpanelClient -from tap_mixpanel.transform import transform_record, transform_datetime +from tap_mixpanel.transform import transform_datetime, transform_record LOGGER = singer.get_logger() @@ -22,6 +20,7 @@ class MixPanel: A base class representing singer streams. :param client: The API client used to extract records from external source """ + tap_stream_id = None replication_method = None replication_keys = [] @@ -43,20 +42,47 @@ def __init__(self, client: MixpanelClient): self.client = client def write_schema(self, catalog, stream_name): + """Writes the schema of the stream form the catalog. + + Args: + catalog (singer.Catalog): Catalog object having schema and metadata of all the streams. + stream_name (str): Name of the syncing stream. + + Raises: + err: Raises if any error occur while writing schema. + """ stream = catalog.get_stream(stream_name) schema = stream.schema.to_dict() try: singer.write_schema(stream_name, schema, stream.key_properties) except OSError as err: - LOGGER.error("OS Error writing schema for: %s",stream_name) + LOGGER.error("OS Error writing schema for: %s", stream_name) raise err def get_bookmark(self, state, stream, default): + """Get the bookmark value from the state if available in the state. + Else return start date. 
+
+ Args:
+ state (dict): State containing bookmarks of the streams if available.
+ stream (str): Name of the stream to get the bookmark.
+ default (str): Default value (start_date) to return if the bookmark is not available.
+
+ Returns:
+ str: Returns bookmark value.
+ """
 if (state is None) or ("bookmarks" not in state):
 return default
 return state.get("bookmarks", {}).get(stream, default)
 def write_bookmark(self, state, stream, value):
+ """Updates the stream bookmark value in the state and writes the state.
+
+ Args:
+ state (dict): State containing bookmarks of the streams if available.
+ stream (str): Name of stream whose bookmark will be written.
+ value (str): Bookmark value of the stream.
+ """
 if "bookmarks" not in state:
 state["bookmarks"] = {}
 state["bookmarks"][stream] = value
@@ -73,6 +99,25 @@ def process_records(
 max_bookmark_value=None,
 last_datetime=None,
 ):
+ """Transform each record as per the schema and write it if its replication value > bookmark.
+
+ Args:
+ stream_name (str): Name of the syncing stream.
+ records (list): Records to be written.
+ time_extracted (datetime): Datetime when the data was extracted from the API.
+ bookmark_field (str, optional): Bookmark field in the state if stream is INCREMENTAL.
+ Defaults to None.
+ max_bookmark_value (str, optional): Maximum bookmark value of the written records if a replication key
+ is available. Defaults to None.
+ last_datetime (str, optional): Last datetime; only records with a greater replication value will be written.
+ Defaults to None.
+
+ Raises:
+ err: Raises exception if transformation error occurs.
+
+ Returns:
+ tuple: Tuple of the maximum bookmark value among written records and the written record count.
+ """
 stream = catalog.get_stream(stream_name)
 schema = stream.schema.to_dict()
 stream_metadata = metadata.to_map(stream.metadata)
@@ -86,9 +131,10 @@ def process_records(
 record, schema, stream_metadata
 )
 except Exception as err:
- LOGGER.error("Error: %s",str(err))
- LOGGER.error("For schema: %s",
- json.dumps(schema, sort_keys=True, indent=2)
+ LOGGER.error("Error: %s", str(err))
+ LOGGER.error(
+ "For schema: %s",
+ json.dumps(schema, sort_keys=True, indent=2),
 )
 raise err
@@ -123,34 +169,67 @@ def process_records(
 return max_bookmark_value, counter.value
 def get_and_transform_records(
- self, querystring, project_timezone, max_bookmark_value, catalog, last_datetime, endpoint_total,
- limit, total_records, parent_total, record_count, page, offset, parent_record, date_total):
- """
- Get the records using the client get request and transform it using transform_records
- and return the max_bookmark_value
+ self,
+ querystring,
+ project_timezone,
+ max_bookmark_value,
+ catalog,
+ last_datetime,
+ endpoint_total,
+ limit,
+ total_records,
+ parent_total,
+ record_count,
+ page,
+ offset,
+ parent_record,
+ date_total,
+ ):
+ """Get the records using the client get request, transform them using
+ transform_record, and return the max_bookmark_value.
+
+ Args:
+ querystring (str): Params in URL query format to join with the stream path.
+ project_timezone (str): Time zone in which integer date times are stored.
+ max_bookmark_value (str): Maximum bookmark value among written records.
+ catalog (singer.Catalog): Catalog object having schema and metadata of all the streams.
+ last_datetime (str): Last datetime; only records with a greater replication value will be written.
+ endpoint_total (int): Total number of records written so far.
+ limit (int): Page size.
+ total_records (int): Total number of records available for the sync.
+ parent_total (int): Total records for parent ID + record_count (int): Number of records per page written by tap. + page (int): Page count. + offset (int): Offset value of stream data for the pagination. + parent_record (dict): Record of parent stream. + date_total (int): Total records written for the date window. + + Raises: + Exception: Raises if any key-property is missing. + + Returns: + tuple: Returns tuple of parent_total, date_total, offset, page, session_id, + endpoint_total, max_bookmark_value, total_records """ session_id = None data = self.client.request( - method='GET', + method="GET", url=self.url, path=self.path, params=querystring, - endpoint=self.tap_stream_id) + endpoint=self.tap_stream_id, + ) # time_extracted: datetime when the data was extracted from the API time_extracted = utils.now() - full_url = '{}/{}{}'.format( - self.url, - self.path, - '?{}'.format(querystring) if querystring else '') + full_url = f"{self.url}/{self.path}{f'?{querystring}' if querystring else ''}" if not data: - LOGGER.info('No data for URL: %s',full_url) + LOGGER.info("No data for URL: %s", full_url) # No data results - else: # has data + else: # Has data # Transform data with transform_json from transform.py # The data_key identifies the array/list of records below the element - # LOGGER.info('data = {}'.format(data)) # TESTING, comment out transformed_data = [] # initialize the record list # Endpoints: funnels, revenue return results as dictionary for each date @@ -159,10 +238,10 @@ def get_and_transform_records( results = {} results_list = [] for key, val in data[self.data_key].items(): - # skip $overall summary - if key != '$overall': - val['date'] = key - val['datetime'] = '{}T00:00:00Z'.format(key) + # Skip $overall summary + if key != "$overall": + val["date"] = key + val["datetime"] = f"{key}T00:00:00Z" results_list.append(val) results[self.data_key] = results_list data = results @@ -170,35 +249,32 @@ def get_and_transform_records( # Cohorts endpoint returns results as a list/array (no data_key) # All other endpoints have a data_key if not self.data_key: - self.data_key = 'results' - new_data = { - 'results': data - } + self.data_key = "results" + new_data = {"results": data} data = new_data transformed_data = [] # Loop through result records for record in data[self.data_key]: - # transform record and append to transformed_data array - transformed_record = transform_record(record, - self.tap_stream_id, - project_timezone, - parent_record) + # Transform record and append to transformed_data array + transformed_record = transform_record( + record, self.tap_stream_id, project_timezone, parent_record + ) transformed_data.append(transformed_record) # Check for missing keys for key in self.key_properties: val = transformed_record.get(key) if not val: - LOGGER.error('Error: Missing Key') - raise 'Missing Key' + LOGGER.error("Error: Missing Key") + raise Exception("Missing Key") # End data record loop if not transformed_data: - LOGGER.info('No transformed data for data = %s', data) - # No transformed data results - else: # has transformed data + LOGGER.info("No transformed data for data = %s", data) + # No transformed data results + else: # Has transformed data # Process records and get the max_bookmark_value and record_count max_bookmark_value, record_count = self.process_records( catalog=catalog, @@ -207,47 +283,73 @@ def get_and_transform_records( time_extracted=time_extracted, bookmark_field=next(iter(self.replication_keys), None), max_bookmark_value=max_bookmark_value, - 
last_datetime=last_datetime) - LOGGER.info('Stream %s, batch processed %s records', self.tap_stream_id, record_count) + last_datetime=last_datetime, + ) + LOGGER.info( + "Stream %s, batch processed %s records", + self.tap_stream_id, + record_count, + ) - # set total_records and pagination fields + # Set total_records and pagination fields if page == 0: if isinstance(data, dict): - total_records = data.get('total', record_count) + total_records = data.get("total", record_count) else: total_records = record_count parent_total = parent_total + record_count date_total = date_total + record_count endpoint_total = endpoint_total + record_count if isinstance(data, dict): - session_id = data.get('session_id', None) + session_id = data.get("session_id", None) # to_rec: to record; ending record for the batch page if self.pagination: to_rec = offset + limit - if to_rec > total_records: - to_rec = total_records + to_rec = min(to_rec, total_records) else: to_rec = record_count - LOGGER.info('Synced Stream: %s, page: %s, %s to %s of total: %s', - self.tap_stream_id, - page, - offset, - to_rec, - total_records) + LOGGER.info( + "Synced Stream: %s, page: %s, %s to %s of total: %s", + self.tap_stream_id, + page, + offset, + to_rec, + total_records, + ) # End has transformed data # End has data results # Pagination: increment the offset by the limit (batch-size) and page offset = offset + limit page = page + 1 - return parent_total, date_total, offset, page, session_id, endpoint_total, max_bookmark_value, total_records - - def define_bookmark_filters(self, days_interval, last_datetime, now_datetime, attribution_window): - """ - define the params from and to according to the filters provided in - the bookmark_query_field_from and bookmark_query_field_to + return ( + parent_total, + date_total, + offset, + page, + session_id, + endpoint_total, + max_bookmark_value, + total_records, + ) + + def define_bookmark_filters( + self, days_interval, last_datetime, now_datetime, attribution_window + ): + """Define the params from and to according to the filters provided in + the bookmark_query_field_from and bookmark_query_field_to. + + Args: + days_interval (int): Interval in days between start_window and end_window + last_datetime (str): Last datetime from records will be fetched. + now_datetime (datetime): Current datetime from sync started. + attribution_window (int): Latency minimum number of days to look-back to + account for delays in attributing accurate results. 
+
+ Returns:
+ tuple: Returns tuple of start_window, end_window and days_interval.
 """
 if self.bookmark_query_field_from and self.bookmark_query_field_to:
 # days_interval from config date_window_size, default = 60; passed to function from sync
@@ -258,57 +360,69 @@ def define_bookmark_filters(self, days_interval, last_datetime, now_datetime, at
 delta_days = (now_datetime - last_dttm).days
 if delta_days <= attribution_window:
 delta_days = attribution_window
- LOGGER.info("Start bookmark less than %s day attribution window.", attribution_window)
+ LOGGER.info(
+ "Start bookmark less than %s day attribution window.",
+ attribution_window,
+ )
 elif delta_days >= 365:
 delta_days = 365
- LOGGER.warning("Start date or bookmark greater than 1 year maxiumum.")
+ LOGGER.warning("Start date or bookmark greater than 1 year maximum.")
 LOGGER.warning("Setting bookmark start to 1 year ago.")
 start_window = now_datetime - timedelta(days=delta_days)
 end_window = start_window + timedelta(days=days_interval)
- if end_window > now_datetime:
- end_window = now_datetime
+ end_window = min(end_window, now_datetime)
 else:
 start_window = strptime_to_utc(last_datetime)
 end_window = now_datetime
 diff_sec = (end_window - start_window).seconds
- # round-up difference to days
+ # Round-up difference to days
 days_interval = math.ceil(diff_sec / (3600 * 24))
 return start_window, end_window, days_interval
 def sync(self, state, catalog, config, start_date):
- # the sync method common to all the streams which internally call methods depending on different endpoints
+ """The sync method common to all the streams, which internally calls methods depending on different endpoints.
+
+ Args:
+ state (dict): State containing bookmarks of the streams if available.
+ catalog (singer.Catalog): Catalog object having schema and metadata of all the streams.
+ config (dict): Tap config containing the settings for this run.
+ start_date (str): The default value to use if no bookmark exists for an endpoint.
+
+ Returns:
+ int: Returns total number of records.
+ """ bookmark_field = next(iter(self.replication_keys), None) project_timezone = config.get("project_timezone", "UTC") days_interval = int(config.get("date_window_size", "30")) attribution_window = int(config.get("attribution_window", "5")) - #Update url if eu_residency is selected - if str(config.get('eu_residency')).lower() == "true": - if self.tap_stream_id == 'export': - self.url = 'https://data-eu.mixpanel.com/api/2.0' + # Update url if eu_residency is selected + if str(config.get("eu_residency")).lower() == "true": + if self.tap_stream_id == "export": + self.url = "https://data-eu.mixpanel.com/api/2.0" else: - self.url = 'https://eu.mixpanel.com/api/2.0' + self.url = "https://eu.mixpanel.com/api/2.0" # Get the latest bookmark for the stream and set the last_integer/datetime - last_datetime = self.get_bookmark( - state, self.tap_stream_id, start_date) + last_datetime = self.get_bookmark(state, self.tap_stream_id, start_date) max_bookmark_value = last_datetime self.write_schema(catalog, self.tap_stream_id) - # windowing: loop through date days_interval date windows from last_datetime to now_datetime + # Windowing: loop through date days_interval date windows from last_datetime to now_datetime tzone = pytz.timezone(project_timezone) now_datetime = datetime.now(tzone) - end_date = config.get('end_date') + end_date = config.get("end_date") if end_date: now_datetime = strptime_to_utc(end_date) start_window, end_window, days_interval = self.define_bookmark_filters( - days_interval, last_datetime, now_datetime, attribution_window) + days_interval, last_datetime, now_datetime, attribution_window + ) # LOOP order: Date Windows, Parent IDs, Page # Initialize counter endpoint_total = 0 # Total for ALL: parents, date windows, and pages @@ -321,7 +435,7 @@ def sync(self, state, catalog, config, start_date): total_records = 0 # Total records for all pages record_count = 0 # Total processed for page - params = self.params # adds in endpoint specific, sort, filter params + params = self.params # Adds in endpoint specific, sort, filter params if self.bookmark_query_field_from and self.bookmark_query_field_to: # Request dates need to be normalized to project timezone or else errors may occur @@ -335,13 +449,15 @@ def sync(self, state, catalog, config, start_date): params[self.bookmark_query_field_from] = from_date params[self.bookmark_query_field_to] = to_date - # funnels and cohorts have a parent endpoint with parent_data and parent_id_field + # Funnels and cohorts have a parent endpoint with parent_data and parent_id_field if self.parent_path and self.parent_id_field: # API request data - LOGGER.info("URL for Parent Stream %s: %s/%s", - self.tap_stream_id, - self.url, - self.parent_path) + LOGGER.info( + "URL for Parent Stream %s: %s/%s", + self.tap_stream_id, + self.url, + self.parent_path, + ) parent_data = self.client.request( method="GET", url=self.url, @@ -355,9 +471,11 @@ def sync(self, state, catalog, config, start_date): for parent_record in parent_data: parent_id = parent_record.get(self.parent_id_field) - LOGGER.info('START: Stream: %s, parent_id: %s', self.tap_stream_id, parent_id) + LOGGER.info( + "START: Stream: %s, parent_id: %s", self.tap_stream_id, parent_id + ) - # pagination: loop thru all pages of data using next (if not None) + # Pagination: loop thru all pages of data using next (if not None) page = 0 # First page is page=0, second page is page=1, ... 
offset = 0 limit = 250 # Default page_size @@ -366,42 +484,66 @@ def sync(self, state, catalog, config, start_date): total_records = 0 # Total records for all pages record_count = 0 # Total processed for page - session_id = 'initial' + session_id = "initial" if self.pagination: - params['page_size'] = limit + params["page_size"] = limit # Popped session_id and page number of last parents stream call. - params.pop('session_id', None) - params.pop('page', None) + params.pop("session_id", None) + params.pop("page", None) while offset <= total_records and session_id is not None: if self.pagination and page != 0: - params['session_id'] = session_id - params['page'] = page + params["session_id"] = session_id + params["page"] = page # querystring: Squash query params into string and replace [parent_id] - querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]) - querystring = querystring.replace('[parent_id]', str(parent_id)) + querystring = "&".join( + [f"{key}={value}" for (key, value) in params.items()] + ) + querystring = querystring.replace("[parent_id]", str(parent_id)) - full_url = '{}/{}{}'.format( - self.url, - self.path, - '?{}'.format(querystring) if querystring else '') + full_url = f"{self.url}/{self.path}{f'?{querystring}' if querystring else ''}" - LOGGER.info('URL for Stream %s: %s', self.tap_stream_id, full_url) + LOGGER.info("URL for Stream %s: %s", self.tap_stream_id, full_url) # API request data # data = {} - parent_total, date_total, offset, page, session_id, endpoint_total, max_bookmark_value, total_records = self.get_and_transform_records( - querystring, project_timezone, max_bookmark_value, catalog, last_datetime, endpoint_total, limit, total_records, parent_total, record_count, page, offset, parent_record, date_total) - # End stream != 'export' - LOGGER.info('FINISHED: Stream: %s, parent_id: %s', self.tap_stream_id, parent_id) - LOGGER.info('Total records for parent: %s', parent_total) + ( + parent_total, + date_total, + offset, + page, + session_id, + endpoint_total, + max_bookmark_value, + total_records, + ) = self.get_and_transform_records( + querystring, + project_timezone, + max_bookmark_value, + catalog, + last_datetime, + endpoint_total, + limit, + total_records, + parent_total, + record_count, + page, + offset, + parent_record, + date_total, + ) + # End stream != 'export' + LOGGER.info( + "FINISHED: Stream: %s, parent_id: %s", self.tap_stream_id, parent_id + ) + LOGGER.info("Total records for parent: %s", parent_total) # End parent record loop - LOGGER.info("FINISHED Sync for Stream: %s",self.tap_stream_id) + LOGGER.info("FINISHED Sync for Stream: %s", self.tap_stream_id) if self.bookmark_query_field_from: - LOGGER.info("Date window from: %s to %s",from_date, to_date) + LOGGER.info("Date window from: %s to %s", from_date, to_date) LOGGER.info("Total records for date window: %s", date_total) # Increment date window start_window = end_window @@ -424,6 +566,7 @@ class Annotations(MixPanel): List the annotations for a given date range. Docs: https://developer.mixpanel.com/reference/annotations """ + tap_stream_id = "annotations" key_properties = ["date"] path = "annotations" @@ -438,9 +581,11 @@ class Annotations(MixPanel): class CohortMembers(MixPanel): """ The list endpoint returns all of the cohorts in a given project. - The JSON formatted return contains the cohort name, id, count, description, creation date, and visibility for every cohort in the project. 
+ The JSON formatted return contains the cohort name, id, count, + description, creation date, and visibility for every cohort in the project. Docs: https://developer.mixpanel.com/reference/engage """ + tap_stream_id = "cohort_members" path = "engage" key_properties = ["cohort_id", "distinct_id"] @@ -457,9 +602,11 @@ class CohortMembers(MixPanel): class Cohorts(MixPanel): """ - Takes a JSON object with a single key called id whose value is the cohort ID. behaviors and filter_by_cohort are mutually exclusive. + Takes a JSON object with a single key called id whose value is the cohort ID. + behaviors and filter_by_cohort are mutually exclusive. Docs: https://developer.mixpanel.com/reference/cohorts """ + tap_stream_id = "cohorts" path = "cohorts/list" key_properties = ["id"] @@ -476,6 +623,7 @@ class Engage(MixPanel): Query user profile data and return list of users that fit specified parameters. Docs: https://developer.mixpanel.com/reference/engage """ + tap_stream_id = "engage" path = "engage" data_key = "results" @@ -495,6 +643,7 @@ class Export(MixPanel): complete with all event properties (including distinct_id) and the exact timestamp the event was fired. Docs: https://developer.mixpanel.com/reference/export """ + tap_stream_id = "export" path = "export" data_key = "results" @@ -506,36 +655,74 @@ class Export(MixPanel): params = {} def get_and_transform_records( - self, querystring, project_timezone, max_bookmark_value, catalog, last_datetime, endpoint_total, - limit, total_records, parent_total, record_count, page, offset, parent_record, date_total): + self, + querystring, + project_timezone, + max_bookmark_value, + catalog, + last_datetime, + endpoint_total, + limit, + total_records, + parent_total, + record_count, + page, + offset, + parent_record, + date_total, + ): """ Get the records using the client get request and transform it using transform_records - and return the max_bookmark_value + and return the max_bookmark_value. + + Args: + querystring (str): Params in URL query format to join with stream path + project_timezone (str): Time zone in which integer date times are stored. + max_bookmark_value (str): Maximum bookmark value among written records. + catalog (singer.Catalog): Catalog object having schema and metadata of all the streams. + last_datetime (str): Last datetime from which greater replication value records will be written. + endpoint_total (int): Total number of records written yet. + limit (int): Page size. + total_records (int): Total number of records available for the sync. + parent_total (int): # Total records for parent ID + record_count (int): Number of records per page written by tap. + page (int): Page count. + offset (int): Offset value of stream data for the pagination. + parent_record (dict): Record of parent stream. + date_total (int): Total records written for the date window. + + Raises: + Exception: Raises if any key-property is missing. 
+ + Returns: + tuple: Returns tuple of parent_total, date_total, offset, page, session_id, + endpoint_total, max_bookmark_value, total_records """ data = self.client.request_export( - method='GET', + method="GET", url=self.url, path=self.path, params=querystring, - endpoint=self.tap_stream_id) + endpoint=self.tap_stream_id, + ) # time_extracted: datetime when the data was extracted from the API time_extracted = utils.now() transformed_data = [] for record in data: - if record and str(record) != '': - # transform record and append to transformed_data array - transformed_record = transform_record(record, - self.tap_stream_id, - project_timezone) + if record and str(record) != "": + # Transform record and append to transformed_data array + transformed_record = transform_record( + record, self.tap_stream_id, project_timezone + ) transformed_data.append(transformed_record) # Check for missing keys for key in self.key_properties: val = transformed_record.get(key) if not val: - LOGGER.error('Error: Missing Key') - raise 'Missing Key' + LOGGER.error("Error: Missing Key") + raise Exception("Missing Key") if len(transformed_data) == limit: # Process full batch (limit = 250) records @@ -547,8 +734,13 @@ def get_and_transform_records( time_extracted=time_extracted, bookmark_field=next(iter(self.replication_keys), None), max_bookmark_value=max_bookmark_value, - last_datetime=last_datetime) - LOGGER.info('Stream %s, batch processed %s records', self.tap_stream_id, record_count) + last_datetime=last_datetime, + ) + LOGGER.info( + "Stream %s, batch processed %s records", + self.tap_stream_id, + record_count, + ) total_records = total_records + record_count parent_total = parent_total + record_count @@ -568,8 +760,13 @@ def get_and_transform_records( time_extracted=time_extracted, bookmark_field=next(iter(self.replication_keys), None), max_bookmark_value=max_bookmark_value, - last_datetime=last_datetime) - LOGGER.info('Stream %s, batch processed %s records', self.tap_stream_id, record_count) + last_datetime=last_datetime, + ) + LOGGER.info( + "Stream %s, batch processed %s records", + self.tap_stream_id, + record_count, + ) total_records = total_records + record_count parent_total = parent_total + record_count @@ -579,14 +776,25 @@ def get_and_transform_records( # Export does not provide pagination; session_id = None breaks out of loop. session_id = None - return parent_total, date_total, offset, page, session_id, endpoint_total, max_bookmark_value, total_records + return ( + parent_total, + date_total, + offset, + page, + session_id, + endpoint_total, + max_bookmark_value, + total_records, + ) class Funnels(MixPanel): """ - Get data for a funnel. funnel_id as a parameter to the API to get the funnel that you wish to get data for. + Get data for a funnel. + funnel_id as a parameter to the API to get the funnel that you wish to get data for. Docs: https://developer.mixpanel.com/reference/funnels """ + tap_stream_id = "funnels" path = "funnels" key_properties = ["funnel_id", "date"] diff --git a/tap_mixpanel/sync.py b/tap_mixpanel/sync.py index 613d703..cf0b218 100644 --- a/tap_mixpanel/sync.py +++ b/tap_mixpanel/sync.py @@ -6,14 +6,15 @@ def update_currently_syncing(state, stream_name): - """ - Currently syncing sets the stream currently being delivered in the state. + """Currently syncing sets the stream currently being delivered in the + state. + If the integration is interrupted, this state property is used to identify the starting point to continue from. 
Reference: https://github.com/singer-io/singer-python/blob/master/singer/bookmarks.py#L41-L46 """ - if (stream_name is None) and ('currently_syncing' in state): - del state['currently_syncing'] + if (stream_name is None) and ("currently_syncing" in state): + del state["currently_syncing"] else: singer.set_currently_syncing(state, stream_name) singer.write_state(state) @@ -25,26 +26,27 @@ def sync(client, config, catalog, state, start_date): last_stream = Previous currently synced stream, if the load was interrupted """ last_stream = singer.get_currently_syncing(state) - LOGGER.info('last/currently syncing stream: %s', last_stream) + LOGGER.info("last/currently syncing stream: %s", last_stream) selected_streams = [] for stream in catalog.get_selected_streams(state): selected_streams.append(stream.stream) - LOGGER.info('selected_streams: %s', selected_streams) + LOGGER.info("selected_streams: %s", selected_streams) if not selected_streams: return # Loop through selected_streams for stream_name in selected_streams: - LOGGER.info('START Syncing: %s', stream_name) + LOGGER.info("START Syncing: %s", stream_name) update_currently_syncing(state, stream_name) stream_obj = STREAMS[stream_name](client) endpoint_total = stream_obj.sync( - catalog=catalog, - state=state, - config=config, - start_date=start_date + catalog=catalog, state=state, config=config, start_date=start_date ) update_currently_syncing(state, None) - LOGGER.info('FINISHED Syncing: %s, Total endpoint records: %s', stream_name, endpoint_total) + LOGGER.info( + "FINISHED Syncing: %s, Total endpoint records: %s", + stream_name, + endpoint_total, + ) diff --git a/tap_mixpanel/transform.py b/tap_mixpanel/transform.py index 18d059f..042ec48 100644 --- a/tap_mixpanel/transform.py +++ b/tap_mixpanel/transform.py @@ -1,20 +1,31 @@ import datetime + import pytz import singer -from singer.utils import strftime from singer import Transformer +from singer.utils import strftime LOGGER = singer.get_logger() -# De-nest properties for engage and export endpoints + def denest_properties(record, properties_node): + """De-nest properties for engage and export endpoints. Write fields to + first level from `properties_node`. + + Args: + record (dict): Record to update. + properties_node (str): Nested object whose fields will be written at 1st level. + + Returns: + dict: Updated record + """ new_record = record properties = record.get(properties_node) if properties: for key, val in record[properties_node].items(): - if key[0:1] == '$': - new_key = 'mp_reserved_{}'.format(key[1:]) - # change this to regex + if key[0:1] == "$": + new_key = f"mp_reserved_{key[1:]}" + # Change this to regex else: new_key = key new_record[new_key] = val @@ -22,9 +33,17 @@ def denest_properties(record, properties_node): return new_record -# Time conversion from $time integer using project_timezone # Reference: https://help.mixpanel.com/hc/en-us/articles/115004547203-Manage-Timezones-for-Projects-in-Mixpanel#exporting-data-from-mixpanel def transform_event_times(record, project_timezone): + """Time conversion from $time integer using project_timezone. + + Args: + record (dict): Record to be transform. + project_timezone (str): Time zone in which integer date times are stored. + + Returns: + dict: Updated record. 
+ """ new_record = record timezone = pytz.timezone(project_timezone) @@ -37,7 +56,7 @@ def transform_event_times(record, project_timezone): beginning_datetime = pytz.utc.localize(naive_datetime).astimezone(timezone) # Get integer time - time_int = int(record.get('time')) + time_int = int(record.get("time")) # Create new_time_utc by adding seconds to beginning_datetime, normalizing, # and converting to string @@ -46,52 +65,96 @@ def transform_event_times(record, project_timezone): # 'normalize' accounts for daylight savings time new_time_utc_str = strftime(timezone.normalize(new_time).astimezone(pytz.utc)) - new_record['time'] = new_time_utc_str + new_record["time"] = new_time_utc_str return new_record + def transform_datetime(this_dttm): - with Transformer() as transformer: - new_dttm = transformer._transform_datetime(this_dttm) - return new_dttm + """Transform date_time string TO DATETIME object. + + Args: + this_dttm (str): Formatted date-time string + + Returns: + datetime: Datetime object passed string. + """ + with Transformer() as transformer: + new_dttm = transformer._transform_datetime(this_dttm) + return new_dttm -# Remove leading $ from engage $distinct_id def transform_engage(record): + """Remove leading $ from engage $distinct_id. + + Args: + record (dict): record to be update. + + Returns: + dict: New updated record. + """ new_record = record - distinct_id = record.get('$distinct_id') - new_record['distinct_id'] = distinct_id - new_record.pop('$distinct_id', None) + distinct_id = record.get("$distinct_id") + new_record["distinct_id"] = distinct_id + new_record.pop("$distinct_id", None) return new_record -# Funnels: combine parent record with each date record def transform_funnels(record, parent_record): + """Funnels: Combine parent record with each date record + + Args: + record (dict): Record to be transform. + parent_record (dict): Parent record. + + Returns: + dict: Updated record. + """ record.update(parent_record) return record -# Cohort Members: provide all distinct_id's for each cohort_id def transform_cohort_members(record, parent_record): - cohort_id = parent_record.get('id') - distinct_id = record.get('$distinct_id') + """Cohort Members: provide all distinct_id's for each cohort_id. + + Args: + record (dict): Record to be transform. + parent_record (dict): Parent stream record. + + Returns: + dict: Record with id fields. + """ + cohort_id = parent_record.get("id") + distinct_id = record.get("$distinct_id") new_record = {} - new_record['distinct_id'] = distinct_id - new_record['cohort_id'] = cohort_id + new_record["distinct_id"] = distinct_id + new_record["cohort_id"] = cohort_id return new_record # Run other transforms, as needed: denest_list_nodes, transform_conversation_parts def transform_record(record, stream_name, project_timezone, parent_record=None): - if stream_name == 'engage': + """Transform record and add fields at first level as required by stream. + + Args: + record (dict): Record to be transform. + stream_name (str): Stream that record belongs to. + project_timezone (str): Time zone in which integer date times are stored. + parent_record (dict, optional): Parent stream record if current stream is child. + Defaults to None. + + Returns: + dict: Transformed record. 
+ """ + if stream_name == "engage": trans_json = transform_engage(record) - new_record = denest_properties(trans_json, '$properties') - elif stream_name == 'export': - denested_json = denest_properties(record, 'properties') + new_record = denest_properties(trans_json, "$properties") + elif stream_name == "export": + denested_json = denest_properties(record, "properties") new_record = transform_event_times(denested_json, project_timezone) - elif stream_name == 'funnels': + elif stream_name == "funnels": new_record = transform_funnels(record, parent_record) - elif stream_name == 'cohort_members': + elif stream_name == "cohort_members": new_record = transform_cohort_members(record, parent_record) else: new_record = record diff --git a/tests/configuration/fixtures.py b/tests/configuration/fixtures.py index 1f53e9f..869f533 100644 --- a/tests/configuration/fixtures.py +++ b/tests/configuration/fixtures.py @@ -1,10 +1,14 @@ import pytest + from tap_mixpanel.client import MixpanelClient @pytest.fixture def mixpanel_client(): - # Support of request_timeout have been added. So, now MixpanelClient accept request_timeout parameter which is mandatory - mixpanel_client = MixpanelClient('API_SECRET', api_domain="mixpanel.com", request_timeout=1) # Pass extra request_timeout parameter + # Support of request_timeout have been added. + # So, now MixpanelClient accept request_timeout parameter which is mandatory + mixpanel_client = MixpanelClient( + "API_SECRET", api_domain="mixpanel.com", request_timeout=1 + ) # Pass extra request_timeout parameter mixpanel_client._MixpanelClient__verified = True return mixpanel_client diff --git a/tests/tap_tester/base.py b/tests/tap_tester/base.py index 5700364..b3e24c0 100644 --- a/tests/tap_tester/base.py +++ b/tests/tap_tester/base.py @@ -3,23 +3,19 @@ """ import os -import unittest from datetime import datetime as dt from datetime import timedelta import dateutil.parser import pytz - -from tap_tester import connections -from tap_tester import runner -from tap_tester import menagerie -from tap_tester.logger import LOGGER -from tap_tester.base_case import BaseCase +from tap_tester import LOGGER, BaseCase, connections, menagerie, runner class TestMixPanelBase(BaseCase): - """ Test the tap combined """ + """Test the tap combined.""" + START_DATE_FORMAT = "%Y-%m-%dT00:00:00Z" + BOOKMARK_COMPARISON_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" REPLICATION_KEYS = "valid-replication-keys" PRIMARY_KEYS = "table-key-properties" FOREIGN_KEYS = "table-foreign-key-properties" @@ -28,49 +24,56 @@ class TestMixPanelBase(BaseCase): FULL_TABLE = "FULL_TABLE" API_LIMIT = 250 TYPE = "platform.mixpanel" + OBEYS_START_DATE = "obey-start-date" start_date = "" end_date = "" eu_residency = True def tap_name(self): - """The name of the tap""" + """The name of the tap.""" return "tap-mixpanel" def expected_metadata(self): - """The expected streams and metadata about the streams""" + """The expected streams and metadata about the streams.""" return { - 'export': { + "export": { self.PRIMARY_KEYS: set(), self.REPLICATION_METHOD: self.INCREMENTAL, - self.REPLICATION_KEYS: {'time'}, + self.REPLICATION_KEYS: {"time"}, + self.OBEYS_START_DATE: True, }, - 'engage': { + "engage": { self.PRIMARY_KEYS: {"distinct_id"}, self.REPLICATION_METHOD: self.FULL_TABLE, + self.OBEYS_START_DATE: False, }, - 'funnels': { - self.PRIMARY_KEYS: {'funnel_id', 'date'}, + "funnels": { + self.PRIMARY_KEYS: {"funnel_id", "date"}, self.REPLICATION_METHOD: self.INCREMENTAL, - self.REPLICATION_KEYS: {'datetime'}, + self.REPLICATION_KEYS: 
{"datetime"}, + self.OBEYS_START_DATE: True, }, - 'cohorts': { + "cohorts": { self.PRIMARY_KEYS: {"id"}, self.REPLICATION_METHOD: self.FULL_TABLE, + self.OBEYS_START_DATE: False, }, - 'cohort_members': { + "cohort_members": { self.PRIMARY_KEYS: {"cohort_id", "distinct_id"}, self.REPLICATION_METHOD: self.FULL_TABLE, + self.OBEYS_START_DATE: False, }, - 'revenue': { + "revenue": { self.PRIMARY_KEYS: {"date"}, self.REPLICATION_METHOD: self.INCREMENTAL, - self.REPLICATION_KEYS: {'datetime'}, + self.REPLICATION_KEYS: {"datetime"}, + self.OBEYS_START_DATE: True, }, - - 'annotations': { + "annotations": { self.PRIMARY_KEYS: {"date"}, - self.REPLICATION_METHOD: self.FULL_TABLE - } + self.REPLICATION_METHOD: self.FULL_TABLE, + self.OBEYS_START_DATE: False, + }, } def setUp(self): @@ -90,23 +93,25 @@ def setUp(self): BaseCase.setUp(self) def get_type(self): - """the expected url route ending""" + """The expected url route ending.""" return "platform.mixpanel" def get_properties(self, original: bool = True): """Configuration properties required for the tap.""" return_value = { - 'start_date': '2020-02-01T00:00:00Z', - 'end_date': '2020-03-01T00:00:00Z', - 'date_window_size': '30', - 'attribution_window': '5', - 'project_timezone': 'US/Pacific', - "eu_residency": 'false', - 'select_properties_by_default': 'false' + "start_date": "2020-02-01T00:00:00Z", + "end_date": "2020-03-01T00:00:00Z", + "date_window_size": "30", + "attribution_window": "5", + "project_timezone": "US/Pacific", + "eu_residency": "false", + "select_properties_by_default": "false", } if self.eu_residency: - return_value.update({"project_timezone": "Europe/Amsterdam", "eu_residency": 'true'}) + return_value.update( + {"project_timezone": "Europe/Amsterdam", "eu_residency": "true"} + ) if original: return return_value @@ -118,7 +123,10 @@ def get_start_date(self): return dt.strftime(dt.utcnow() - timedelta(days=30), self.START_DATE_FORMAT) def get_credentials(self): - """Authentication information for the test account. Api secret is expected as a property.""" + """ + Authentication information for the test account. + Api secret is expected as a property. + """ credentials_dict = {} if self.eu_residency: @@ -132,10 +140,10 @@ def get_credentials(self): return credentials_dict def expected_streams(self): - """A set of expected stream names""" + """A set of expected stream names.""" - # Skip `export` and `revenue` stream for EU recidency server as - # revenue stream endpoint returns 400 bad reuqest and + # Skip `export` and `revenue` stream for EU residency server as + # revenue stream endpoint returns 400 bad request and # export stream endpoint returns 200 terminated early response. # So, as per discussion decided that let the customer come with the issues # that these streams are not working. Skip the streams in the circleci. 
@@ -145,29 +153,35 @@ def expected_streams(self): return set(self.expected_metadata().keys()) def expected_pks(self): - """return a dictionary with key of table name and value as a set of primary key fields""" - return {table: properties.get(self.PRIMARY_KEYS, set()) - for table, properties - in self.expected_metadata().items()} + """Return a dictionary with key of table name and value as a set of primary key fields""" + return { + table: properties.get(self.PRIMARY_KEYS, set()) + for table, properties in self.expected_metadata().items() + } def expected_replication_keys(self): - """return a dictionary with key of table name and value as a set of replication key fields""" - return {table: properties.get(self.REPLICATION_KEYS, set()) - for table, properties - in self.expected_metadata().items()} + """Return a dictionary with key of table name and value as a set of replication key fields""" + return { + table: properties.get(self.REPLICATION_KEYS, set()) + for table, properties in self.expected_metadata().items() + } def expected_replication_method(self): - """return a dictionary with key of table name nd value of replication method""" - return {table: properties.get(self.REPLICATION_METHOD, None) - for table, properties - in self.expected_metadata().items()} + """Return a dictionary with key of table name and value of replication method""" + return { + table: properties.get(self.REPLICATION_METHOD, None) + for table, properties in self.expected_metadata().items() + } def expected_automatic_fields(self): - """return a dictionary with key of table name and value as a set of automatic key fields""" + """Return a dictionary with key of table name and value as a set of automatic key fields""" auto_fields = {} for k, v in self.expected_metadata().items(): - auto_fields[k] = v.get(self.PRIMARY_KEYS, set()) | v.get(self.REPLICATION_KEYS, set()) \ + auto_fields[k] = ( + v.get(self.PRIMARY_KEYS, set()) + | v.get(self.REPLICATION_KEYS, set()) | v.get(self.FOREIGN_KEYS, set()) + ) return auto_fields ######################### @@ -188,15 +202,18 @@ def run_and_verify_check_mode(self, conn_id): menagerie.verify_check_exit_status(self, exit_status, check_job_name) found_catalogs = menagerie.get_catalogs(conn_id) - self.assertGreater(len( - found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) + self.assertGreater( + len(found_catalogs), + 0, + msg=f"Unable to locate schemas for connection {conn_id}", + ) - found_catalog_names = set( - map(lambda c: c['stream_name'], found_catalogs)) + found_catalog_names = set(map(lambda c: c["stream_name"], found_catalogs)) subset = self.expected_streams().issubset(found_catalog_names) self.assertTrue( - subset, msg="Expected check streams are not subset of discovered catalog") + subset, msg="Expected check streams are not subset of discovered catalog" + ) LOGGER.info("discovered schemas are OK") return found_catalogs @@ -217,87 +234,97 @@ def run_and_verify_sync(self, conn_id): # Verify actual rows were synced sync_record_count = runner.examine_target_output_file( - self, conn_id, self.expected_streams(), self.expected_pks()) + self, conn_id, self.expected_streams(), self.expected_pks() + ) self.assertGreater( - sum(sync_record_count.values()), 0, - msg="failed to replicate any data: {}".format(sync_record_count) + sum(sync_record_count.values()), + 0, + msg=f"failed to replicate any data: {sync_record_count}", ) LOGGER.info(f"total replicated row count: {sum(sync_record_count.values())}") return sync_record_count - def 
perform_and_verify_table_and_field_selection(self, conn_id, test_catalogs, select_all_fields=True): + def perform_and_verify_table_and_field_selection( + self, conn_id, test_catalogs, select_all_fields=True + ): """ Perform table and field selection based off of the streams to select set and field selection parameters. - Verify this results in the expected streams selected and all or no - fields selected for those streams. + Verify this results in the expected streams selected and all or + no fields selected for those streams. """ # Select all available fields or select no fields from all testable streams - self.select_all_streams_and_fields( - conn_id, test_catalogs, select_all_fields) + self.select_all_streams_and_fields(conn_id, test_catalogs, select_all_fields) catalogs = menagerie.get_catalogs(conn_id) # Ensure our selection affects the catalog - expected_selected = [tc.get('stream_name') for tc in test_catalogs] + expected_selected = [cat.get("stream_name") for cat in test_catalogs] for cat in catalogs: - catalog_entry = menagerie.get_annotated_schema( - conn_id, cat['stream_id']) + catalog_entry = menagerie.get_annotated_schema(conn_id, cat["stream_id"]) # Verify all testable streams are selected - selected = catalog_entry.get('annotated-schema').get('selected') + selected = catalog_entry.get("annotated-schema").get("selected") LOGGER.info(f"Validating selection on {cat['stream_name']}: {selected}") - if cat['stream_name'] not in expected_selected: - self.assertFalse( - selected, msg="Stream selected, but not testable.") + if cat["stream_name"] not in expected_selected: + self.assertFalse(selected, msg="Stream selected, but not testable.") continue # Skip remaining assertions if we aren't selecting this stream self.assertTrue(selected, msg="Stream not selected.") if select_all_fields: # Verify all fields within each selected stream are selected - for field, field_props in catalog_entry.get('annotated-schema').get('properties').items(): - field_selected = field_props.get('selected') - LOGGER.info(f"\tValidating selection on {cat['stream_name']}.{field}: {field_selected}") + for field, field_props in ( + catalog_entry.get("annotated-schema").get("properties").items() + ): + field_selected = field_props.get("selected") + LOGGER.info( + f"\tValidating selection on {cat['stream_name']}.{field}: {field_selected}" + ) self.assertTrue(field_selected, msg="Field not selected.") else: # Verify only automatic fields are selected expected_automatic_fields = self.expected_automatic_fields().get( - cat['stream_name']) + cat["stream_name"] + ) selected_fields = self.get_selected_fields_from_metadata( - catalog_entry['metadata']) + catalog_entry["metadata"] + ) self.assertEqual(expected_automatic_fields, selected_fields) def get_selected_fields_from_metadata(self, metadata): selected_fields = set() for field in metadata: - is_field_metadata = len(field['breadcrumb']) > 1 + is_field_metadata = len(field["breadcrumb"]) > 1 inclusion_automatic_or_selected = ( - field['metadata']['selected'] is True or - field['metadata']['inclusion'] == 'automatic' + field["metadata"]["selected"] is True or + field["metadata"]["inclusion"] == "automatic" ) if is_field_metadata and inclusion_automatic_or_selected: - selected_fields.add(field['breadcrumb'][1]) + selected_fields.add(field["breadcrumb"][1]) return selected_fields - def select_all_streams_and_fields(self, conn_id, catalogs, select_all_fields: bool = True): - """Select all streams and all fields within streams""" + def select_all_streams_and_fields( + self, 
conn_id, catalogs, select_all_fields: bool = True + ): + """Select all streams and all fields within streams.""" for catalog in catalogs: - schema = menagerie.get_annotated_schema( - conn_id, catalog['stream_id']) + schema = menagerie.get_annotated_schema(conn_id, catalog["stream_id"]) non_selected_properties = [] if not select_all_fields: # get a list of all properties so that none are selected - non_selected_properties = schema.get('annotated-schema', {}).get( - 'properties', {}).keys() + non_selected_properties = ( + schema.get("annotated-schema", {}).get("properties", {}).keys() + ) connections.select_catalog_and_fields_via_metadata( - conn_id, catalog, schema, [], non_selected_properties) + conn_id, catalog, schema, [], non_selected_properties + ) def parse_date(self, date_value): """ @@ -308,7 +335,7 @@ def parse_date(self, date_value): "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%f+00:00", "%Y-%m-%dT%H:%M:%S+00:00", - "%Y-%m-%d" + "%Y-%m-%d", } for date_format in date_formats: try: @@ -318,22 +345,31 @@ def parse_date(self, date_value): continue raise NotImplementedError( - "Tests do not account for dates of this format: {}".format(date_value)) + f"Tests do not account for dates of this format: {date_value}" + ) def calculated_states_by_stream(self, current_state): - timedelta_by_stream = {stream: [1,0,0] # {stream_name: [days, hours, minutes], ...} - for stream in self.expected_streams()} + timedelta_by_stream = { + stream: [1, 0, 0] # {stream_name: [days, hours, minutes], ...} + for stream in self.expected_streams() + } - stream_to_calculated_state = {stream: "" for stream in current_state['bookmarks'].keys()} - for stream, state in current_state['bookmarks'].items(): + stream_to_calculated_state = { + stream: "" for stream in current_state["bookmarks"].keys() + } + for stream, state in current_state["bookmarks"].items(): state_as_datetime = dateutil.parser.parse(state) days, hours, minutes = timedelta_by_stream[stream] - calculated_state_as_datetime = state_as_datetime - timedelta(days=days, hours=hours, minutes=minutes) + calculated_state_as_datetime = state_as_datetime - timedelta( + days=days, hours=hours, minutes=minutes + ) - state_format = '%Y-%m-%dT%H:%M:%S-00:00' - calculated_state_formatted = dt.strftime(calculated_state_as_datetime, state_format) + state_format = "%Y-%m-%dT%H:%M:%S-00:00" + calculated_state_formatted = dt.strftime( + calculated_state_as_datetime, state_format + ) stream_to_calculated_state[stream] = calculated_state_formatted @@ -347,7 +383,7 @@ def convert_state_to_utc(self, date_str): """ Convert a saved bookmark value of the form '2020-08-25T13:17:36-07:00' to a string formatted utc datetime, - in order to compare aginast json formatted datetime values + in order to compare against json formatted datetime values. 
""" date_object = dateutil.parser.parse(date_str) date_object_utc = date_object.astimezone(tz=pytz.UTC) @@ -362,14 +398,20 @@ def timedelta_formatted(self, dtime, days=0): except ValueError: try: - date_stripped = dt.strptime( - dtime, self.BOOKMARK_COMPARISON_FORMAT) + date_stripped = dt.strptime(dtime, self.BOOKMARK_COMPARISON_FORMAT) return_date = date_stripped + timedelta(days=days) return dt.strftime(return_date, self.BOOKMARK_COMPARISON_FORMAT) except ValueError: - return Exception("Datetime object is not of the format: {}".format(self.START_DATE_FORMAT)) + return Exception( + "Datetime object is not of the format: {}".format( + self.START_DATE_FORMAT + ) + ) def is_incremental(self, stream): - return self.expected_metadata().get(stream).get(self.REPLICATION_METHOD) == self.INCREMENTAL + return ( + self.expected_metadata().get(stream).get(self.REPLICATION_METHOD) + == self.INCREMENTAL + ) diff --git a/tests/tap_tester/test_all_fields_pagination.py b/tests/tap_tester/test_all_fields_pagination.py deleted file mode 100644 index 31ffe79..0000000 --- a/tests/tap_tester/test_all_fields_pagination.py +++ /dev/null @@ -1,163 +0,0 @@ -from math import ceil - -import tap_tester.connections as connections -import tap_tester.runner as runner -import tap_tester.menagerie as menagerie -from tap_tester.logger import LOGGER - -from base import TestMixPanelBase - - -class MixPanelPaginationAllFieldsTest(TestMixPanelBase): - - @staticmethod - def name(): - return "mixpanel_pagination_all_fields_test" - - def pagination_test_run(self): - """ - All Fields Test - • and that when all fields are selected more than the automatic fields are replicated. - • Verify no unexpected streams were replicated - • Verify that more than just the automatic fields are replicated for each stream. - • verify all fields for each stream are replicated - • verify that the automatic fields are sent to the target - - - Pagination Test - • Verify that for each stream you can get multiple pages of data - • Verify no duplicate pages are replicated - • Verify no unexpected streams were replicated - - PREREQUISITE - For EACH stream add enough data that you surpass the limit of a single - fetch of data. For instance if you have a limit of 250 records ensure - that 251 (or more) records have been posted for that stream. 
- """ - - # Only following below 2 streams support pagination - streams_to_test_all_fields = self.expected_streams() - streams_to_test_pagination = {'engage', 'cohort_members'} - - expected_automatic_fields = self.expected_automatic_fields() - conn_id = connections.ensure_connection(self) - - found_catalogs = self.run_and_verify_check_mode(conn_id) - - # table and field selection - test_catalogs_all_fields = [catalog for catalog in found_catalogs - if catalog.get('tap_stream_id') in streams_to_test_all_fields] - - self.perform_and_verify_table_and_field_selection( - conn_id, test_catalogs_all_fields) - - # grab metadata after performing table-and-field selection to set expectations - # used for asserting all fields are replicated - stream_to_all_catalog_fields = dict() - for catalog in test_catalogs_all_fields: - stream_id, stream_name = catalog['stream_id'], catalog['stream_name'] - catalog_entry = menagerie.get_annotated_schema(conn_id, stream_id) - fields_from_field_level_md = [md_entry['breadcrumb'][1] - for md_entry in catalog_entry['metadata'] - if md_entry['breadcrumb'] != []] - stream_to_all_catalog_fields[stream_name] = set( - fields_from_field_level_md) - - record_count_by_stream = self.run_and_verify_sync(conn_id) - - actual_fields_by_stream = runner.examine_target_output_for_fields() - - synced_records = runner.get_records_from_target_output() - - # Verify no unexpected streams were replicated - synced_stream_names = set(synced_records.keys()) - self.assertSetEqual(streams_to_test_all_fields, synced_stream_names) - - # All Fields Test - for stream in streams_to_test_all_fields: - with self.subTest(logging="Primary Functional Test", stream=stream): - - # expected values - expected_all_keys = stream_to_all_catalog_fields[stream] - expected_automatic_keys = expected_automatic_fields.get( - stream, set()) - - # collect actual values - messages = synced_records.get(stream) - actual_all_keys = set() - for message in messages['messages']: - if message['action'] == 'upsert': - actual_all_keys.update(set(message['data'].keys())) - - # verify that the automatic fields are sent to the target - self.assertTrue( - actual_fields_by_stream.get(stream, set()).issuperset( - expected_automatic_keys), - msg="The fields sent to the target don't include all automatic fields") - - # Verify that more than just the automatic fields are replicated for each stream. - if stream != "cohort_members": # cohort_member has just 2 key and both are automatic - self.assertGreater(len(expected_all_keys), - len(expected_automatic_keys)) - - self.assertTrue(expected_automatic_keys.issubset( - expected_all_keys), msg=f'{expected_automatic_keys-expected_all_keys} is not in "expected_all_keys"') - - # As we can't find the below fields in the docs and also - # it won't be generated by mixpanel APIs now so expected. - if stream == "export": - expected_all_keys = expected_all_keys - {'labels', 'sampling_factor', 'dataset', 'mp_reserved_duration_s', 'mp_reserved_origin_end', - 'mp_reserved_origin_start', 'mp_reserved_event_count'} - - # verify all fields for each stream are replicated - if not stream == "engage": #Skip engage as it return records in random manner with dynamic fields. 
- self.assertSetEqual(expected_all_keys, actual_all_keys) - - # Pagination Test - for stream in streams_to_test_pagination: - with self.subTest(stream=stream): - - # expected values - expected_primary_keys = self.expected_pks()[stream] - - # collect actual values - messages = synced_records.get(stream) - primary_keys_list = [tuple([message['data'][expected_pk] for expected_pk in expected_primary_keys]) - for message in messages['messages'] if message['action'] == 'upsert'] - - # verify that we can paginate with all fields selected - record_count_sync = record_count_by_stream.get(stream, 0) - self.assertGreater(record_count_sync, self.API_LIMIT, - msg="The number of records is not over the stream max limit") - - - # Chunk the replicated records (just primary keys) into expected pages - pages = [] - page_count = ceil(len(primary_keys_list) / self.API_LIMIT) - page_size = self.API_LIMIT - for page_index in range(page_count): - page_start = page_index * page_size - page_end = (page_index + 1) * page_size - pages.append(set(primary_keys_list[page_start:page_end])) - - # Verify by primary keys that data is unique for each page - for current_index, current_page in enumerate(pages): - with self.subTest(current_page_primary_keys=current_page): - - for other_index, other_page in enumerate(pages): - if current_index == other_index: - continue # don't compare the page to itself - - self.assertTrue( - current_page.isdisjoint(other_page), msg=f'other_page_primary_keys={other_page}' - ) - - def test_run(self): - # Pagination test for standard server - self.eu_residency = False - self.pagination_test_run() - - - # Pagination test for EU residency server - self.eu_residency = True - self.pagination_test_run() diff --git a/tests/tap_tester/test_discovery.py b/tests/tap_tester/test_discovery.py deleted file mode 100644 index 5442212..0000000 --- a/tests/tap_tester/test_discovery.py +++ /dev/null @@ -1,142 +0,0 @@ -import re -from tap_tester import menagerie, connections -from tap_tester.logger import LOGGER - -from base import TestMixPanelBase - -class MixPanelDiscoverTest(TestMixPanelBase): - """ - Testing that discovery creates the appropriate catalog with valid metadata. - • Verify number of actual streams discovered match expected - • Verify the stream names discovered were what we expect - • Verify stream names follow naming convention - streams should only have lowercase alphas and underscores - • verify there is only 1 top level breadcrumb - • verify replication key(s) - • verify primary key(s) - • verify that if there is a replication key we are doing INCREMENTAL otherwise FULL - • verify the actual replication matches our expected replication method - • verify that primary, replication and foreign keys - are given the inclusion of automatic. - • verify that all other fields have inclusion of available metadata. 
- """ - - @staticmethod - def name(): - return "mix_panel_discover_test" - - def discovery_test_run(self): - - region = "EU" if self.eu_residency else "Standard" - LOGGER.info(f"Testing against {region} account.") - - self.assertion_logging_enabled = True - - streams_to_test = self.expected_streams() - - conn_id = connections.ensure_connection(self, payload_hook=None) - - # Verify that there are catalogs found - found_catalogs = self.run_and_verify_check_mode(conn_id) - - # Verify stream names follow naming convention - # streams should only have lowercase alphas and underscores - found_catalog_names = {c['tap_stream_id'] for c in found_catalogs} - self.assertTrue(all([re.fullmatch(r"[a-z_]+", name) for name in found_catalog_names]), - logging="asserting all streams defined in catalog follow the naming convention '[a-z_]+'") - - for stream in streams_to_test: - with self.subTest(stream=stream): - - # Verify the caatalog is found for a given stream - catalog = next(iter([catalog for catalog in found_catalogs - if catalog["stream_name"] == stream])) - self.assertIsNotNone(catalog, logging="asserting entry is present in catalog") - - # collecting expected values - expected_primary_keys = self.expected_pks()[stream] - expected_replication_keys = self.expected_replication_keys()[ - stream] - expected_automatic_fields = self.expected_automatic_fields().get(stream) - expected_replication_method = self.expected_replication_method()[ - stream] - - # collecting actual values... - schema_and_metadata = menagerie.get_annotated_schema( - conn_id, catalog['stream_id']) - metadata = schema_and_metadata["metadata"] - stream_properties = [ - item for item in metadata if item.get("breadcrumb") == []] - actual_primary_keys = set( - stream_properties[0].get( - "metadata", {self.PRIMARY_KEYS: []}).get(self.PRIMARY_KEYS, []) - ) - actual_replication_keys = set( - stream_properties[0].get( - "metadata", {self.REPLICATION_KEYS: []}).get(self.REPLICATION_KEYS, []) - ) - actual_replication_method = stream_properties[0].get( - "metadata", {self.REPLICATION_METHOD: None}).get(self.REPLICATION_METHOD) - actual_automatic_fields = set( - item.get("breadcrumb", ["properties", None])[1] for item in metadata - if item.get("metadata").get("inclusion") == "automatic" - ) - - ########################################################################## - # metadata assertions - ########################################################################## - - # verify there is only 1 top level breadcrumb in metadata - self.assertEqual(len(stream_properties), 1, - logging='asserting there is only 1 top level breadcrumb in metadata') - - # verify that if there is a replication key we are doing INCREMENTAL otherwise FULL - if actual_replication_keys: - self.assertEqual( - actual_replication_method, self.INCREMENTAL, - logging=f"asserting replication method is {self.INCREMENTAL} when replication keys are defined" - ) - else: - self.assertEqual( - actual_replication_method, self.FULL_TABLE, - logging=f"asserting replication method is {self.FULL_TABLE} when replication keys are not defined" - ) - - # verify the actual replication matches our expected replication method - self.assertEqual(expected_replication_method, actual_replication_method, - logging=f"asserting replication method is {expected_replication_method}") - - # verify replication key(s) - self.assertEqual(expected_replication_keys, actual_replication_keys, - logging=f"asserting replication keys are {expected_replication_keys}") - - - # verify primary key(s) match expectations 
- self.assertSetEqual(expected_primary_keys, actual_primary_keys, - logging=f"asserting primary keys are {expected_primary_keys}") - - # verify that primary keys and replication keys - # are given the inclusion of automatic in metadata. - self.assertSetEqual(expected_automatic_fields, actual_automatic_fields, - logging=f"asserting primary and replication keys {expected_automatic_fields} are automatic") - - # verify that all other fields have inclusion of available - # This assumes there are no unsupported fields for SaaS sources - self.assertTrue( - all({item.get("metadata").get("inclusion") == "available" - for item in metadata - if item.get("breadcrumb", []) != [] - and item.get("breadcrumb", ["properties", None])[1] - not in actual_automatic_fields}), - logging=f"asserting non-key-property fields are available for field selection") - - - def test_standard_discovery(self): - """Discovery test for standard server""" - self.eu_residency = False - self.discovery_test_run() - - def test_eu_discovery(self): - """Discovery test for EU recidency server""" - self.eu_residency = True - self.discovery_test_run() diff --git a/tests/tap_tester/test_mixpanel_all_fields.py b/tests/tap_tester/test_mixpanel_all_fields.py new file mode 100644 index 0000000..84f0ee2 --- /dev/null +++ b/tests/tap_tester/test_mixpanel_all_fields.py @@ -0,0 +1,125 @@ +from tap_tester import runner, connections, menagerie + +from base import TestMixPanelBase + + +class MixPanelAllFieldsTest(TestMixPanelBase): + + @staticmethod + def name(): + return "tap_tester_mixpanel_all_fields_test" + + def all_fields_test(self): + """ + All Fields Test. + + • Verify that when all fields are selected more than the automatic fields are replicated. + • Verify no unexpected streams were replicated + • Verify that more than just the automatic fields are replicated for each stream. 
+        • Verify all fields for each stream are replicated
+        • Verify that the automatic fields are sent to the target
+        """
+
+        # All streams are tested with all fields selected
+        expected_streams = self.expected_streams()
+
+        expected_automatic_fields = self.expected_automatic_fields()
+        conn_id = connections.ensure_connection(self)
+
+        found_catalogs = self.run_and_verify_check_mode(conn_id)
+
+        # Table and field selection
+        test_catalogs_all_fields = [
+            catalog
+            for catalog in found_catalogs
+            if catalog.get("tap_stream_id") in expected_streams
+        ]
+
+        self.perform_and_verify_table_and_field_selection(
+            conn_id, test_catalogs_all_fields
+        )
+
+        # Grab metadata after performing table-and-field selection to set expectations
+        # used for asserting all fields are replicated
+        stream_to_all_catalog_fields = dict()
+        for catalog in test_catalogs_all_fields:
+            stream_id, stream_name = catalog["stream_id"], catalog["stream_name"]
+            catalog_entry = menagerie.get_annotated_schema(conn_id, stream_id)
+            fields_from_field_level_md = [
+                md_entry["breadcrumb"][1]
+                for md_entry in catalog_entry["metadata"]
+                if md_entry["breadcrumb"] != []
+            ]
+            stream_to_all_catalog_fields[stream_name] = set(fields_from_field_level_md)
+
+        self.run_and_verify_sync(conn_id)
+
+        actual_fields_by_stream = runner.examine_target_output_for_fields()
+
+        synced_records = runner.get_records_from_target_output()
+
+        # Verify no unexpected streams were replicated
+        synced_stream_names = set(synced_records.keys())
+        self.assertSetEqual(expected_streams, synced_stream_names)
+
+        # All Fields Test
+        for stream in expected_streams:
+            with self.subTest(logging="Primary Functional Test", stream=stream):
+
+                # Expected values
+                expected_all_keys = stream_to_all_catalog_fields[stream]
+                expected_automatic_keys = expected_automatic_fields.get(stream, set())
+
+                # Collect actual values
+                messages = synced_records.get(stream)
+                actual_all_keys = set()
+                for message in messages["messages"]:
+                    if message["action"] == "upsert":
+                        actual_all_keys.update(set(message["data"].keys()))
+
+                # Verify that the automatic fields are sent to the target
+                self.assertTrue(
+                    actual_fields_by_stream.get(stream, set()).issuperset(
+                        expected_automatic_keys
+                    ),
+                    msg="The fields sent to the target don't include all automatic fields",
+                )
+
+                # Verify that more than just the automatic fields are replicated for each stream.
+                # 'cohort_members' has just 2 keys and both are automatic
+                if stream != "cohort_members":
+                    self.assertGreater(
+                        len(expected_all_keys), len(expected_automatic_keys)
+                    )
+
+                self.assertTrue(
+                    expected_automatic_keys.issubset(expected_all_keys),
+                    msg=f'{expected_automatic_keys-expected_all_keys} is not in "expected_all_keys"',
+                )
+
+                # The fields below are not documented and are no longer returned
+                # by the Mixpanel APIs, so they are excluded from the expectation.
+                if stream == "export":
+                    expected_all_keys = expected_all_keys - {
+                        "labels",
+                        "sampling_factor",
+                        "dataset",
+                        "mp_reserved_duration_s",
+                        "mp_reserved_origin_end",
+                        "mp_reserved_origin_start",
+                        "mp_reserved_event_count",
+                    }
+
+                # Verify all fields for each stream are replicated.
+                # Skip engage as it returns records in random order with dynamic fields.
+ if not stream == "engage": + self.assertSetEqual(expected_all_keys, actual_all_keys) + + def test_run(self): + # Pagination test for standard server + self.eu_residency = False + self.all_fields_test() + + # Pagination test for EU residency server + self.eu_residency = True + self.all_fields_test() diff --git a/tests/tap_tester/test_automatic_fields.py b/tests/tap_tester/test_mixpanel_automatic_fields.py similarity index 67% rename from tests/tap_tester/test_automatic_fields.py rename to tests/tap_tester/test_mixpanel_automatic_fields.py index 7017942..c04195b 100644 --- a/tests/tap_tester/test_automatic_fields.py +++ b/tests/tap_tester/test_mixpanel_automatic_fields.py @@ -1,32 +1,38 @@ -import tap_tester.connections as connections -import tap_tester.runner as runner +from tap_tester import connections, runner + from base import TestMixPanelBase class MixPanelAutomaticFieldsTest(TestMixPanelBase): """ - Ensure running the tap with all streams selected and all fields deselected results in the replication of just the + Ensure running the tap with all streams selected and all fields + deselected results in the replication of just the primary keys and replication keys (automatic fields). """ @staticmethod def name(): - return "mix_panel_automatic_fields_test" + return "tap_tester_mixpanel_automatic_fields_test" def automatic_fields_test_run(self): """ - Verify that for each stream you can get enough data - when no fields are selected and only the automatic fields are replicated. + • Verify we can deselect all fields except when inclusion=automatic, + which is handled by base.py methods + • Verify that only the automatic fields are sent to the target. + • Verify that all replicated records have unique primary key values. """ - streams_to_test = self.expected_streams() + expected_streams = self.expected_streams() conn_id = connections.ensure_connection(self) found_catalogs = self.run_and_verify_check_mode(conn_id) - # table and field selection - test_catalogs_automatic_fields = [catalog for catalog in found_catalogs - if catalog.get('tap_stream_id') in streams_to_test] + # Table and field selection + test_catalogs_automatic_fields = [ + catalog + for catalog in found_catalogs + if catalog.get("tap_stream_id") in expected_streams + ] self.perform_and_verify_table_and_field_selection( conn_id, test_catalogs_automatic_fields, select_all_fields=False) @@ -34,13 +40,13 @@ def automatic_fields_test_run(self): record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() - for stream in streams_to_test: + for stream in expected_streams: with self.subTest(stream=stream): - # expected values + # Expected values expected_keys = self.expected_automatic_fields().get(stream) - # collect actual values + # Collect actual values data = synced_records.get(stream, {}) record_messages_keys = [set(row['data'].keys()) for row in data.get('messages', [])] @@ -55,13 +61,12 @@ def automatic_fields_test_run(self): for actual_keys in record_messages_keys: self.assertSetEqual(expected_keys, actual_keys) - def test_standard_auto_fields(self): """Automatic fields test for standard server""" self.eu_residency = False self.automatic_fields_test_run() def test_eu_auto_fields(self): - """Automatic fields test for EU recidency server""" + """Automatic fields test for EU residency server""" self.eu_residency = True self.automatic_fields_test_run() diff --git a/tests/tap_tester/test_bookmark.py b/tests/tap_tester/test_mixpanel_bookmark.py similarity index 69% rename from 
tests/tap_tester/test_bookmark.py rename to tests/tap_tester/test_mixpanel_bookmark.py index f6d0b10..83989eb 100644 --- a/tests/tap_tester/test_bookmark.py +++ b/tests/tap_tester/test_mixpanel_bookmark.py @@ -1,7 +1,7 @@ -import tap_tester.connections as connections -import tap_tester.runner as runner +from tap_tester import menagerie, connections, runner + from base import TestMixPanelBase -from tap_tester import menagerie + class MixPanelBookMarkTest(TestMixPanelBase): @@ -9,7 +9,7 @@ class MixPanelBookMarkTest(TestMixPanelBase): @staticmethod def name(): - return "mix_panel_bookmark_test" + return "tap_tester_mixpanel_bookmark_test" def bookmark_test_run(self): """ @@ -40,9 +40,12 @@ def bookmark_test_run(self): # Run in check mode found_catalogs = self.run_and_verify_check_mode(conn_id) - # table and field selection - catalog_entries = [catalog for catalog in found_catalogs - if catalog.get('tap_stream_id') in expected_streams] + # Table and field selection + catalog_entries = [ + catalog + for catalog in found_catalogs + if catalog.get("tap_stream_id") in expected_streams + ] self.perform_and_verify_table_and_field_selection( conn_id, catalog_entries) @@ -56,11 +59,10 @@ def bookmark_test_run(self): # Update State Between Syncs ########################################################################## - new_states = {'bookmarks': dict()} - simulated_states = self.calculated_states_by_stream( - first_sync_bookmarks) + new_states = {"bookmarks": dict()} + simulated_states = self.calculated_states_by_stream(first_sync_bookmarks) for stream, new_state in simulated_states.items(): - new_states['bookmarks'][stream] = new_state + new_states["bookmarks"][stream] = new_state menagerie.set_state(conn_id, new_states) ########################################################################## @@ -78,38 +80,43 @@ def bookmark_test_run(self): for stream in expected_streams: with self.subTest(stream=stream): - # expected values + # Expected values expected_replication_method = expected_replication_methods[stream] - # collect information for assertions from syncs 1 & 2 base on expected values + # Collect information for assertions from syncs 1 & 2 base on expected values first_sync_count = first_sync_record_count.get(stream, 0) second_sync_count = second_sync_record_count.get(stream, 0) - first_sync_messages = [record.get('data') for record in - first_sync_records.get( - stream, {}).get('messages', []) - if record.get('action') == 'upsert'] - second_sync_messages = [record.get('data') for record in - second_sync_records.get( - stream, {}).get('messages', []) - if record.get('action') == 'upsert'] + first_sync_messages = [ + record.get("data") + for record in first_sync_records.get(stream, {}).get("messages", []) + if record.get("action") == "upsert" + ] + second_sync_messages = [ + record.get("data") + for record in second_sync_records.get(stream, {}).get( + "messages", [] + ) + if record.get("action") == "upsert" + ] first_bookmark_value = first_sync_bookmarks.get( - 'bookmarks', {stream: None}).get(stream) + "bookmarks", {stream: None} + ).get(stream) second_bookmark_value = second_sync_bookmarks.get( - 'bookmarks', {stream: None}).get(stream) + "bookmarks", {stream: None} + ).get(stream) if expected_replication_method == self.INCREMENTAL: - # collect information specific to incremental streams from syncs 1 & 2 - replication_key = next( - iter(expected_replication_keys[stream])) + # Collect information specific to incremental streams from syncs 1 & 2 + replication_key = 
next(iter(expected_replication_keys[stream])) first_bookmark_value_utc = self.convert_state_to_utc( first_bookmark_value) second_bookmark_value_utc = self.convert_state_to_utc( second_bookmark_value) - simulated_bookmark = new_states['bookmarks'][stream] + simulated_bookmark = new_states["bookmarks"][stream] # Verify the first sync sets a bookmark of the expected form self.assertIsNotNone(first_bookmark_value) @@ -119,35 +126,42 @@ def bookmark_test_run(self): # Verify the second sync bookmark is Equal to the first sync bookmark # assumes no changes to data during test - self.assertEqual(second_bookmark_value, - first_bookmark_value) + self.assertEqual(second_bookmark_value, first_bookmark_value) for record in first_sync_messages: # Verify the first sync bookmark value is the max replication key value for a given stream replication_key_value = record.get(replication_key) self.assertLessEqual( - replication_key_value, first_bookmark_value_utc, - msg="First sync bookmark was set incorrectly, a record with a greater replication-key value was synced." + replication_key_value, + first_bookmark_value_utc, + msg="First sync bookmark was set incorrectly," + "a record with a greater replication-key value was synced.", ) for record in second_sync_messages: # Verify the second sync replication key value is Greater or Equal to the first sync bookmark replication_key_value = record.get(replication_key) - self.assertGreaterEqual(replication_key_value, simulated_bookmark, - msg="Second sync records do not repect the previous bookmark.") + self.assertGreaterEqual( + replication_key_value, + simulated_bookmark, + msg="Second sync records do not respect the previous bookmark.", + ) # Verify the second sync bookmark value is the max replication key value for a given stream self.assertLessEqual( - replication_key_value, second_bookmark_value_utc, - msg="Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced." 
+ replication_key_value, + second_bookmark_value_utc, + msg="Second sync bookmark was set incorrectly," + " a record with a greater replication-key value was synced.", ) - # verify that you get less data the 2nd time around + # Verify that you get less data the 2nd time around self.assertLess( second_sync_count, first_sync_count, - msg="second syc didn't have less records, bookmark usage not verified") + msg="Second syc didn't have less records, bookmark usage not verified", + ) elif expected_replication_method == self.FULL_TABLE: @@ -162,19 +176,23 @@ def bookmark_test_run(self): raise NotImplementedError( "INVALID EXPECTATIONS\t\tSTREAM: {} REPLICATION_METHOD: {}".format( - stream, expected_replication_method) + stream, expected_replication_method + ) ) # Verify at least 1 record was replicated in the second sync self.assertGreater( - second_sync_count, 0, msg="We are not fully testing bookmarking for {}".format(stream)) + second_sync_count, + 0, + msg=f"We are not fully testing bookmarking for {stream}", + ) def test_standard_bookmarks(self): - """Bookmark test for standard server""" + """Bookmark test for standard server.""" self.eu_residency = False self.bookmark_test_run() def test_eu_bookmarks(self): - """Bookmark test for EU recidency server""" + """Bookmark test for EU residency server.""" self.eu_residency = True self.bookmark_test_run() diff --git a/tests/tap_tester/test_mixpanel_discovery.py b/tests/tap_tester/test_mixpanel_discovery.py new file mode 100644 index 0000000..b578dac --- /dev/null +++ b/tests/tap_tester/test_mixpanel_discovery.py @@ -0,0 +1,190 @@ +import re +from tap_tester import menagerie, connections, LOGGER + +from base import TestMixPanelBase + +class MixPanelDiscoverTest(TestMixPanelBase): + """ + Testing that discovery creates the appropriate catalog with valid metadata. + • Verify number of actual streams discovered match expected + • Verify the stream names discovered were what we expect + • Verify stream names follow naming convention + streams should only have lowercase alphas and underscores + • Verify there is only 1 top level breadcrumb + • Verify replication key(s) + • Verify primary key(s) + • Verify that if there is a replication key we are doing INCREMENTAL otherwise FULL + • Verify the actual replication matches our expected replication method + • Verify that primary, replication and foreign keys + are given the inclusion of automatic. + • Verify that all other fields have inclusion of available metadata. 
+ """ + + @staticmethod + def name(): + return "tap_tester_mixpanel_discover_test" + + def discovery_test_run(self): + + region = "EU" if self.eu_residency else "Standard" + LOGGER.info(f"Testing against {region} account.") + + self.assertion_logging_enabled = True + + streams_to_test = self.expected_streams() + + conn_id = connections.ensure_connection(self, payload_hook=None) + + # Verify that there are catalogs found + found_catalogs = self.run_and_verify_check_mode(conn_id) + + # Verify stream names follow naming convention + # streams should only have lowercase alphas and underscores + found_catalog_names = {c["tap_stream_id"] for c in found_catalogs} + self.assertTrue( + all([re.fullmatch(r"[a-z_]+", name) for name in found_catalog_names]), + logging="asserting all streams defined in catalog follow the naming convention '[a-z_]+'", + ) + + for stream in streams_to_test: + with self.subTest(stream=stream): + + # Verify the catalog is found for a given stream + catalog = next( + iter( + [ + catalog + for catalog in found_catalogs + if catalog["stream_name"] == stream + ] + ) + ) + self.assertIsNotNone( + catalog, logging="Asserting entry is present in catalog" + ) + + # Collecting expected values + expected_primary_keys = self.expected_pks()[stream] + expected_replication_keys = self.expected_replication_keys()[stream] + expected_automatic_fields = self.expected_automatic_fields().get(stream) + expected_replication_method = self.expected_replication_method()[stream] + + # Collecting actual values... + schema_and_metadata = menagerie.get_annotated_schema( + conn_id, catalog["stream_id"] + ) + metadata = schema_and_metadata["metadata"] + stream_properties = [ + item for item in metadata if item.get("breadcrumb") == [] + ] + actual_primary_keys = set( + stream_properties[0] + .get("metadata", {self.PRIMARY_KEYS: []}) + .get(self.PRIMARY_KEYS, []) + ) + actual_replication_keys = set( + stream_properties[0] + .get("metadata", {self.REPLICATION_KEYS: []}) + .get(self.REPLICATION_KEYS, []) + ) + actual_replication_method = ( + stream_properties[0] + .get("metadata", {self.REPLICATION_METHOD: None}) + .get(self.REPLICATION_METHOD) + ) + actual_automatic_fields = { + item.get("breadcrumb", ["properties", None])[1] + for item in metadata + if item.get("metadata").get("inclusion") == "automatic" + } + + actual_fields = [] + for md_entry in metadata: + if md_entry["breadcrumb"] != []: + actual_fields.append(md_entry["breadcrumb"][1]) + + ########################################################################## + # Metadata assertions + ########################################################################## + + # Verify there is only 1 top level breadcrumb in metadata + self.assertEqual( + len(stream_properties), + 1, + logging="Asserting there is only 1 top level breadcrumb in metadata", + ) + + # Verify there is no duplicate metadata entries + self.assertEqual( + len(actual_fields), + len(set(actual_fields)), + msg="Duplicates in the fields retrieved", + ) + + # Verify that if there is a replication key we are doing INCREMENTAL otherwise FULL + if actual_replication_keys: + self.assertEqual( + actual_replication_method, + self.INCREMENTAL, + logging=f"Asserting replication method is {self.INCREMENTAL} when replication keys are defined", + ) + else: + self.assertEqual( + actual_replication_method, + self.FULL_TABLE, + logging=f"Asserting replication method is {self.FULL_TABLE} when replication keys are not defined", + ) + + # Verify the actual replication matches our expected replication 
method + self.assertEqual( + expected_replication_method, + actual_replication_method, + logging=f"Asserting replication method is {expected_replication_method}", + ) + + # Verify replication key(s) + self.assertEqual( + expected_replication_keys, + actual_replication_keys, + logging=f"asserting replication keys are {expected_replication_keys}", + ) + + # Verify primary key(s) match expectations + self.assertSetEqual( + expected_primary_keys, + actual_primary_keys, + logging=f"asserting primary keys are {expected_primary_keys}", + ) + + # Verify that primary keys and replication keys + # are given the inclusion of automatic in metadata. + self.assertSetEqual( + expected_automatic_fields, + actual_automatic_fields, + logging=f"asserting primary and replication keys {expected_automatic_fields} are automatic", + ) + + # Verify that all other fields have inclusion of available. + # This assumes there are no unsupported fields for SaaS sources + self.assertTrue( + all( + { + item.get("metadata").get("inclusion") == "available" + for item in metadata + if item.get("breadcrumb", []) != [] + and item.get("breadcrumb", ["properties", None])[1] + not in actual_automatic_fields + } + ), + logging="Asserting non-key-property fields are available for field selection", + ) + + def test_standard_discovery(self): + """Discovery test for standard server.""" + self.eu_residency = False + self.discovery_test_run() + + def test_eu_discovery(self): + """Discovery test for EU residency server.""" + self.eu_residency = True + self.discovery_test_run() diff --git a/tests/tap_tester/test_mixpanel_pagination.py b/tests/tap_tester/test_mixpanel_pagination.py new file mode 100644 index 0000000..432ca19 --- /dev/null +++ b/tests/tap_tester/test_mixpanel_pagination.py @@ -0,0 +1,107 @@ +from math import ceil + +from tap_tester import connections, runner + +from base import TestMixPanelBase + + +class MixPanelPaginationTest(TestMixPanelBase): + + @staticmethod + def name(): + return "tap_tester_mixpanel_pagination_test" + + def pagination_test_run(self): + """ + Pagination Test + • Verify that for each stream you can get multiple pages of data + • Verify no duplicate pages are replicated + • Verify no unexpected streams were replicated + + PREREQUISITE + For EACH stream add enough data that you surpass the limit of a single + fetch of data. For instance if you have a limit of 250 records ensure + that 251 (or more) records have been posted for that stream. 
+ """ + + # Only following below 2 streams support pagination + expected_streams = {"engage", "cohort_members"} + + conn_id = connections.ensure_connection(self) + + found_catalogs = self.run_and_verify_check_mode(conn_id) + + # Table and field selection + test_catalogs_all_fields = [ + catalog + for catalog in found_catalogs + if catalog.get("tap_stream_id") in expected_streams + ] + + self.perform_and_verify_table_and_field_selection( + conn_id, test_catalogs_all_fields + ) + + record_count_by_stream = self.run_and_verify_sync(conn_id) + + synced_records = runner.get_records_from_target_output() + + # Verify no unexpected streams were replicated + synced_stream_names = set(synced_records.keys()) + self.assertSetEqual(expected_streams, synced_stream_names) + + for stream in expected_streams: + with self.subTest(stream=stream): + + # Expected values + expected_primary_keys = self.expected_pks()[stream] + + # Collect actual values + messages = synced_records.get(stream) + primary_keys_list = [ + tuple( + message["data"][expected_pk] + for expected_pk in expected_primary_keys + ) + for message in messages["messages"] + if message["action"] == "upsert" + ] + + # Verify that we can paginate with all fields selected + record_count_sync = record_count_by_stream.get(stream, 0) + self.assertGreater( + record_count_sync, + self.API_LIMIT, + msg="The number of records is not over the stream max limit", + ) + + # Chunk the replicated records (just primary keys) into expected pages + pages = [] + page_count = ceil(len(primary_keys_list) / self.API_LIMIT) + page_size = self.API_LIMIT + for page_index in range(page_count): + page_start = page_index * page_size + page_end = (page_index + 1) * page_size + pages.append(set(primary_keys_list[page_start:page_end])) + + # Verify by primary keys that data is unique for each page + for current_index, current_page in enumerate(pages): + with self.subTest(current_page_primary_keys=current_page): + + for other_index, other_page in enumerate(pages): + if current_index == other_index: + continue # Don't compare the page to itself + + self.assertTrue( + current_page.isdisjoint(other_page), + msg=f"other_page_primary_keys={other_page}", + ) + + def test_run(self): + # Pagination test for standard server + self.eu_residency = False + self.pagination_test_run() + + # Pagination test for EU residency server + self.eu_residency = True + self.pagination_test_run() diff --git a/tests/tap_tester/test_start_date.py b/tests/tap_tester/test_mixpanel_start_date.py similarity index 50% rename from tests/tap_tester/test_start_date.py rename to tests/tap_tester/test_mixpanel_start_date.py index 560593a..8a5018b 100644 --- a/tests/tap_tester/test_start_date.py +++ b/tests/tap_tester/test_mixpanel_start_date.py @@ -1,6 +1,5 @@ -import tap_tester.connections as connections -import tap_tester.runner as runner from base import TestMixPanelBase +from tap_tester import connections, runner, LOGGER class MixPanelStartDateTest(TestMixPanelBase): @@ -9,12 +8,12 @@ class MixPanelStartDateTest(TestMixPanelBase): @staticmethod def name(): - return "mix_panel_start_date_test" + return "tap_tester_mixpanel_start_date_test" def start_date_test_run(self): - """Instantiate start date according to the desired data set and run the test""" + """Instantiate start date according to the desired data set and run the test.""" - self.start_date_1 = self.get_properties().get('start_date') + self.start_date_1 = self.get_properties().get("start_date") self.start_date_2 = 
self.timedelta_formatted(self.start_date_1, days=15) self.start_date = self.start_date_1 @@ -25,19 +24,23 @@ def start_date_test_run(self): # First Sync ########################################################################## - # instantiate connection + # Instantiate connection conn_id_1 = connections.ensure_connection(self) - # run check mode + # Run check mode found_catalogs_1 = self.run_and_verify_check_mode(conn_id_1) - # table and field selection - test_catalogs_1_all_fields = [catalog for catalog in found_catalogs_1 - if catalog.get('tap_stream_id') in expected_streams] + # Table and field selection + test_catalogs_1_all_fields = [ + catalog + for catalog in found_catalogs_1 + if catalog.get("tap_stream_id") in expected_streams + ] self.perform_and_verify_table_and_field_selection( - conn_id_1, test_catalogs_1_all_fields, select_all_fields=True) + conn_id_1, test_catalogs_1_all_fields, select_all_fields=True + ) - # run initial sync + # Run initial sync record_count_by_stream_1 = self.run_and_verify_sync(conn_id_1) synced_records_1 = runner.get_records_from_target_output() @@ -45,97 +48,115 @@ def start_date_test_run(self): # Update START DATE Between Syncs ########################################################################## - print("REPLICATION START DATE CHANGE: {} ===>>> {} ".format( - self.start_date, self.start_date_2)) + LOGGER.info( + f"REPLICATION START DATE CHANGE: {self.start_date} ===>>> {self.start_date_2} " + ) self.start_date = self.start_date_2 ########################################################################## # Second Sync ########################################################################## - # create a new connection with the new start_date - conn_id_2 = connections.ensure_connection( - self, original_properties=False) + # Create a new connection with the new start_date + conn_id_2 = connections.ensure_connection(self, original_properties=False) - # run check mode + # Run check mode found_catalogs_2 = self.run_and_verify_check_mode(conn_id_2) - # table and field selection - test_catalogs_2_all_fields = [catalog for catalog in found_catalogs_2 - if catalog.get('tap_stream_id') in expected_streams] + # Table and field selection + test_catalogs_2_all_fields = [ + catalog + for catalog in found_catalogs_2 + if catalog.get("tap_stream_id") in expected_streams + ] self.perform_and_verify_table_and_field_selection( - conn_id_2, test_catalogs_2_all_fields, select_all_fields=True) + conn_id_2, test_catalogs_2_all_fields, select_all_fields=True + ) - # run sync + # Run sync record_count_by_stream_2 = self.run_and_verify_sync(conn_id_2) synced_records_2 = runner.get_records_from_target_output() for stream in expected_streams: with self.subTest(stream=stream): - # expected values + # Expected values expected_primary_keys = self.expected_pks()[stream] - expected_start_date_1 = self.timedelta_formatted( - self.start_date_1) - expected_start_date_2 = self.timedelta_formatted( - self.start_date_2) + expected_metadata = self.expected_metadata()[stream] + expected_start_date_1 = self.timedelta_formatted(self.start_date_1) + expected_start_date_2 = self.timedelta_formatted(self.start_date_2) - # collect information for assertions from syncs 1 & 2 base on expected values + # Collect information for assertions from syncs 1 & 2 base on expected values record_count_sync_1 = record_count_by_stream_1.get(stream, 0) record_count_sync_2 = record_count_by_stream_2.get(stream, 0) - primary_keys_list_1 = [tuple(message.get('data').get(expected_pk) for expected_pk in 
expected_primary_keys) - for message in synced_records_1.get(stream, {}).get('messages', []) - if message.get('action') == 'upsert'] - primary_keys_list_2 = [tuple(message.get('data').get(expected_pk) for expected_pk in expected_primary_keys) - for message in synced_records_2.get(stream, {}).get('messages', []) - if message.get('action') == 'upsert'] + primary_keys_list_1 = [ + tuple( + message.get("data").get(expected_pk) + for expected_pk in expected_primary_keys + ) + for message in synced_records_1.get(stream, {}).get("messages", []) + if message.get("action") == "upsert" + ] + primary_keys_list_2 = [ + tuple( + message.get("data").get(expected_pk) + for expected_pk in expected_primary_keys + ) + for message in synced_records_2.get(stream, {}).get("messages", []) + if message.get("action") == "upsert" + ] primary_keys_sync_1 = set(primary_keys_list_1) primary_keys_sync_2 = set(primary_keys_list_2) - if self.is_incremental(stream): + if expected_metadata.get(self.OBEYS_START_DATE): - # collect information specific to incremental streams from syncs 1 & 2 + # Collect information specific to incremental streams from syncs 1 & 2 expected_replication_key = next( - iter(self.expected_replication_keys().get(stream, []))) - replication_dates_1 = [row.get('data').get(expected_replication_key) for row in - synced_records_1.get( - stream, {'messages': []}).get('messages', []) - if row.get('data')] - replication_dates_2 = [row.get('data').get(expected_replication_key) for row in - synced_records_2.get( - stream, {'messages': []}).get('messages', []) - if row.get('data')] + iter(self.expected_replication_keys().get(stream, [])) + ) + replication_dates_1 = [ + row.get("data").get(expected_replication_key) + for row in synced_records_1.get(stream, {"messages": []}).get( + "messages", [] + ) + if row.get("data") + ] + replication_dates_2 = [ + row.get("data").get(expected_replication_key) + for row in synced_records_2.get(stream, {"messages": []}).get( + "messages", [] + ) + if row.get("data") + ] # # Verify replication key is greater or equal to start_date for sync 1 for replication_date in replication_dates_1: self.assertGreaterEqual( - self.parse_date(replication_date), self.parse_date( - expected_start_date_1), - msg="Report pertains to a date prior to our start date.\n" + - "Sync start_date: {}\n".format(expected_start_date_1) + - "Record date: {} ".format(replication_date) + self.parse_date(replication_date), + self.parse_date(expected_start_date_1), + msg="Report pertains to a date prior to our start date.\n" + + f"Sync start_date: {expected_start_date_1}\n" + + f"Record date: {replication_date} ", ) # Verify replication key is greater or equal to start_date for sync 2 for replication_date in replication_dates_2: self.assertGreaterEqual( - self.parse_date(replication_date), self.parse_date( - expected_start_date_2), - msg="Report pertains to a date prior to our start date.\n" + - "Sync start_date: {}\n".format(expected_start_date_2) + - "Record date: {} ".format(replication_date) + self.parse_date(replication_date), + self.parse_date(expected_start_date_2), + msg="Report pertains to a date prior to our start date.\n" + + f"Sync start_date: {expected_start_date_2}\n" + + f"Record date: {replication_date} ", ) # Verify the number of records replicated in sync 1 is greater than the number # of records replicated in sync 2 - self.assertGreater(record_count_sync_1, - record_count_sync_2) + self.assertGreater(record_count_sync_1, record_count_sync_2) # Verify the records replicated in sync 2 were also 
replicated in sync 1
-                    self.assertTrue(
-                        primary_keys_sync_2.issubset(primary_keys_sync_1))
+                    self.assertTrue(primary_keys_sync_2.issubset(primary_keys_sync_1))
 
                 else:
 
@@ -144,14 +165,13 @@ def start_date_test_run(self):
                     self.assertEqual(record_count_sync_2, record_count_sync_1)
 
                     # Verify by primary key the same records are replicated in the 1st and 2nd syncs
-                    self.assertSetEqual(primary_keys_sync_1,
-                                        primary_keys_sync_2)
+                    self.assertSetEqual(primary_keys_sync_1, primary_keys_sync_2)
 
     def test_run(self):
-        #Start date test for standard server
+        # Start date test for standard server
         self.eu_residency = False
         self.start_date_test_run()
 
-        #Start date test for EU recidency server
+        # Start date test for EU residency server
         self.eu_residency = True
         self.start_date_test_run()
diff --git a/tests/unittests/test_error_handling.py b/tests/unittests/test_error_handling.py
index f9776aa..d4fc94e 100644
--- a/tests/unittests/test_error_handling.py
+++ b/tests/unittests/test_error_handling.py
@@ -1,17 +1,20 @@
 import unittest
 from unittest import mock
+from parameterized import parameterized
 
 import requests
+
 from tap_mixpanel import client
 
-# mock responce
+# Mock response
 REQUEST_TIMEOUT = 300
 
-class Mockresponse:
-    """
-    Mocked standard HTTPResponse to test error handling.
-    """
-    def __init__(self, resp, status_code, content=[""], headers=None, raise_error=False, text={}):
+class MockResponse:
+    """Mocked standard HTTPResponse to test error handling."""
+
+    def __init__(
+        self, status_code, resp = "", content=[""], headers=None, raise_error=True, text={}
+    ):
         self.json_data = resp
         self.status_code = status_code
         self.content = content
@@ -20,371 +23,182 @@ def __init__(self, resp, status_code, content=[""], headers=None, raise_error=Fa
         self.text = text
         self.reason = "error"
 
-    def prepare(self):
-        return (self.json_data, self.status_code, self.content, self.headers, self.raise_error)
-
     def raise_for_status(self):
+        """If an error occurs, this method raises an HTTPError.
+
+        Raises:
+            requests.HTTPError: Mock http error.
+
+        Returns:
+            int: Returns the status code if no error occurred.
+        """
         if not self.raise_error:
             return self.status_code
 
         raise requests.HTTPError("mock sample message")
 
     def json(self):
+        """Returns a JSON object of the result."""
        return self.text
 
-# Mock response for timezone related error messages
+
 def get_mock_http_response(content, status_code):
+    """Mock response for timezone related error messages.
+
+    Args:
+        content (str): The content for the mock response.
+        status_code (int): The HTTP status code for the mock response.
+
+    Returns:
+        requests.Response: Custom mock response.
+    """
     response = requests.Response()
     response.status_code = status_code
     response.headers = {}
     response._content = content.encode()
     return response
 
-@mock.patch('time.sleep', return_value=None)  # Mock time.sleep to reduce the time
+
+# Mock time.sleep to reduce the time
+@mock.patch("time.sleep", return_value=None)
 class TestMixpanelErrorHandling(unittest.TestCase):
+    """
+    Test case to verify that the custom error message and
+    backoff are implemented for the errors covered in these tests.
+    """
 
-    def mock_send_400(*args, **kwargs):
-        return Mockresponse("", 400, raise_error=True)
+    timeout_400_error = {
+        "request": "/api/2.0/engage/revenue?from_date=2020-02-01&to_date=2020-03-01",
+        "error": "Timeout Error.",
+    }
 
     def mock_400_different_timezone(*args, **kwargs):
+        """Mock 400 error response with a different timezone.
+
+        Returns:
+            requests.Response: Returns mock 400 error response.
+ """ content = " to_date cannot be later than today" return get_mock_http_response(content, 400) - def mock_send_401(*args, **kwargs): - return Mockresponse("", 401, raise_error=True) - - def mock_send_402(*args, **kwargs): - return Mockresponse("", 402, raise_error=True) - - def mock_send_403(*args, **kwargs): - return Mockresponse("", 403, raise_error=True) - - def mock_send_404(*args, **kwargs): - return Mockresponse("", 404, raise_error=True) - - def mock_send_429(*args, **kwargs): - return Mockresponse("", 429, raise_error=True) - - def mock_send_500(*args, **kwargs): - return Mockresponse("", 500, raise_error=True) - - def mock_send_501(*args, **kwargs): - return Mockresponse("", 501, raise_error=True) - def mock_send_error(*args, **kwargs): + """Mock error response with description in \'error\' field. + + Returns: + requests.Response: Returns mock 404 error response. + """ content = '{"error": "Resource not found error message from API response field \'error\'."}' return get_mock_http_response(content, 404) def mock_send_message(*args, **kwargs): - content = '{"message": "Resource not found error message from API response field \'message\'."}' - return get_mock_http_response(content, 404) + """Mock error response with description in \'message\' field. - @mock.patch("requests.Session.request", side_effect=mock_send_400) - def test_request_with_handling_for_400_exception_handling(self, mock_send_400, mock_sleep): + Returns: + requests.Response: Returns mock 404 error response. """ - Test that `perform_request` method handle 400 error with proper message - """ - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.perform_request('GET') - except client.MixpanelBadRequestError as e: - expected_error_message = "HTTP-error-code: 400, Error: A validation exception has occurred.(Please verify your credentials.)" - # Verifying the message formed for the custom exception - self.assertEqual(str(e), expected_error_message) - - @mock.patch("requests.Session.request", side_effect=mock_400_different_timezone) - def test_request_with_handling_for_400_for_different_timezone(self, mock_400_different_timezone, mock_sleep): - """ - Test that `perform_request` method handle 400 error with proper message for different timezone - """ - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.perform_request('GET') - except client.MixpanelBadRequestError as e: - expected_error_message = "HTTP-error-code: 400, Error: A validation exception has occurred. Please validate the timezone with the MixPanel UI under project settings." - # Verifying the message formed for the custom exception - self.assertEqual(str(e), expected_error_message) + content = '{"message": "Resource not found error message from API response field \'message\'."}' + return get_mock_http_response(content, 404) + @parameterized.expand([ + ["400 error", 400, MockResponse(400), client.MixpanelBadRequestError, "A validation exception has occurred.(Please verify your credentials.)"], + ["400 different timezone error", 400, mock_400_different_timezone(), client.MixpanelBadRequestError, "A validation exception has occurred. 
Please validate the timezone with the MixPanel UI under project settings."], + ["400 timeout error", 400, MockResponse(400, text=timeout_400_error), client.MixpanelBadRequestError, "Timeout Error.(Please verify your credentials.)"], + ["401 error", 401, MockResponse(401), client.MixpanelUnauthorizedError, "Invalid authorization credentials."], + ["402 error", 402, MockResponse(402), client.MixpanelPaymentRequiredError, "Your current plan does not allow API calls. Payment is required to complete the operation."], + ["403 error", 403, MockResponse(403), client.MixpanelForbiddenError, "User does not have permission to access the resource."], + ["404 error", 404, MockResponse(404), client.MixpanelNotFoundError, "The resource you have specified cannot be found."], + ["404 error", 404, mock_send_error(), client.MixpanelNotFoundError, "Resource not found error message from API response field 'error'."], + ["404 error", 404, mock_send_message(), client.MixpanelNotFoundError, "Resource not found error message from API response field 'message'."], + ["429 error", 429, MockResponse(429), client.Server429Error, "The API rate limit for your organization/application pairing has been exceeded."], + ]) @mock.patch("requests.Session.request") - def test_request_with_handling_for_400_timeout_error_handling(self, mock_request, mock_sleep): - """ - Test that `perform_request` method handle 400 error with timeout error message in case of `error` field in response - """ - error = {"request": "/api/2.0/engage/revenue?from_date=2020-02-01&to_date=2020-03-01", "error": "Timeout Error."} - mock_request.return_value = Mockresponse("", 400, raise_error=True, text=error) - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.perform_request('GET') - except client.MixpanelBadRequestError as e: - expected_error_message = "HTTP-error-code: 400, Error: Timeout Error.(Please verify your credentials.)" - # Verifying the message formed for the timeout error - self.assertEqual(str(e), expected_error_message) - - @mock.patch("requests.Session.request", side_effect=mock_send_401) - def test_request_with_handling_for_401_exception_handling(self, mock_send_401, mock_sleep): - """ - Test that `perform_request` method handle 401 error with proper message - """ - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.perform_request('GET') - except client.MixpanelUnauthorizedError as e: - expected_error_message = "HTTP-error-code: 401, Error: Invalid authorization credentials." - # Verifying the message formed for the custom exception - self.assertEqual(str(e), expected_error_message) - - @mock.patch("requests.Session.request", side_effect=mock_send_402) - def test_request_with_handling_for_402_exception_handling(self, mock_send_402, mock_sleep): - """ - Test that `perform_request` method handle 402 error with proper message - """ - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.perform_request('GET') - except client.MixpanelPaymentRequiredError as e: - expected_error_message = "HTTP-error-code: 402, Error: Your current plan does not allow API calls. Payment is required to complete the operation." 
- # Verifying the message formed for the custom exception - self.assertEqual(str(e), expected_error_message) - - @mock.patch("requests.Session.request", side_effect=mock_send_403) - def test_request_with_handling_for_403_exception_handling(self, mock_send_403, mock_sleep): - """ - Test that `perform_request` method handle 403 error with proper message - """ - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.perform_request('GET') - except client.MixpanelForbiddenError as e: - expected_error_message = "HTTP-error-code: 403, Error: User does not have permission to access the resource." - # Verifying the message formed for the custom exception - self.assertEqual(str(e), expected_error_message) - - @mock.patch("requests.Session.request", side_effect=mock_send_404) - def test_request_with_handling_for_404_exception_handling(self, mock_send_404, mock_sleep): - """ - Test that `perform_request` method handle 404 error with proper message - """ - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.perform_request('GET') - except client.MixpanelNotFoundError as e: - expected_error_message = "HTTP-error-code: 404, Error: The resource you have specified cannot be found." - # Verifying the message formed for the custom exception - self.assertEqual(str(e), expected_error_message) - - @mock.patch("requests.Session.request", side_effect=mock_send_429) - def test_request_with_handling_for_429_exception_handling(self, mock_send_429, mock_sleep): - """ - Test that `perform_request` method handle 429 error with proper message - """ - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.perform_request('GET') - except client.Server429Error as e: - expected_error_message = "HTTP-error-code: 429, Error: The API rate limit for your organisation/application pairing has been exceeded." - # Verifying the message formed for the custom exception - self.assertEqual(str(e), expected_error_message) - - @mock.patch("requests.Session.request", side_effect=mock_send_500) - def test_request_with_handling_for_500_exception_handling(self, mock_send_500, mock_sleep): - """ - Test that `perform_request` method handle 500 error with proper message - """ - with self.assertRaises(client.MixpanelInternalServiceError): - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.perform_request('GET') - - @mock.patch("requests.Session.request", side_effect=mock_send_501) - def test_request_with_handling_for_501_exception_handling(self, mock_send_501, mock_sleep): - """ - Test that `perform_request` method handle 501 error with proper message - """ - with self.assertRaises(client.Server5xxError): - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.perform_request('GET') - - @mock.patch("requests.Session.request", side_effect=mock_send_error) - def test_request_with_handling_for_404_exception_handling_error(self, mock_send_error, mock_sleep): - ''' - Verify that if 'error' field is present in API response then it should be used as error message. 
- ''' - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.perform_request('GET') - except client.MixpanelNotFoundError as e: - expected_error_message = "HTTP-error-code: 404, Error: Resource not found error message from API response field 'error'." - # Verifying the message retrived from 'error' field of API response - self.assertEqual(str(e), expected_error_message) - - @mock.patch("requests.Session.request", side_effect=mock_send_message) - def test_request_with_handling_for_404_exception_handling_message(self, mock_send_message, mock_sleep): - ''' - Verify that if 'message' field is present in API response then it should be used as error message. - ''' - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.perform_request('GET') - except client.MixpanelNotFoundError as e: - expected_error_message = "HTTP-error-code: 404, Error: Resource not found error message from API response field 'message'." - # Verifying the message retrived from 'message' field of API response - self.assertEqual(str(e), expected_error_message) - - @mock.patch("requests.Session.get", side_effect=mock_send_400) - def test_check_access_with_handling_for_400_exception_handling(self, mock_send_400, mock_sleep): - """ - Test that `check_access` method handle 404 error with proper message - """ - try: - tap_stream_id = "tap_mixpanel" - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.check_access() - except client.MixpanelBadRequestError as e: - expected_error_message = "HTTP-error-code: 400, Error: A validation exception has occurred.(Please verify your credentials.)" - # Verifying the message formed for the custom exception - self.assertEqual(str(e), expected_error_message) - - @mock.patch("requests.Session.get", side_effect=mock_400_different_timezone) - def test_check_access_with_handling_for_400_for_different_timezone(self, mock_400_different_timezone, mock_sleep): - """ - Test that `check_access` method handle 404 error with proper message for different timezone - """ - try: - tap_stream_id = "tap_mixpanel" - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.check_access() - except client.MixpanelBadRequestError as e: - expected_error_message = "HTTP-error-code: 400, Error: A validation exception has occurred. Please validate the timezone with the MixPanel UI under project settings." - # Verifying the message formed for the custom exception - self.assertEqual(str(e), expected_error_message) - + def test_perform_request_exception_handling( + self, test_name, error_code, mock_response, error, error_message, mock_request, mock_sleep, + ): + """ + Test that `perform_request` method handle error with proper message. 
+ """ + mock_request.return_value = mock_response + mock_client = client.MixpanelClient( + api_secret="mock_api_secret", + api_domain="mock_api_domain", + request_timeout=REQUEST_TIMEOUT, + ) + with self.assertRaises(error) as e: + mock_client.perform_request("GET") + + expected_error_message = f"HTTP-error-code: {error_code}, Error: {error_message}" + + # Verifying the message formed for the custom exception + self.assertEqual(str(e.exception), expected_error_message) + + @parameterized.expand([ + ["400 error", 400, MockResponse(400), client.MixpanelBadRequestError, "A validation exception has occurred.(Please verify your credentials.)"], + ["400 different timezone error", 400, mock_400_different_timezone(), client.MixpanelBadRequestError, "A validation exception has occurred. Please validate the timezone with the MixPanel UI under project settings."], + ["400 timeout error", 400, MockResponse(400, text=timeout_400_error), client.MixpanelBadRequestError, "Timeout Error.(Please verify your credentials.)"], + ["401 error", 401, MockResponse(401), client.MixpanelUnauthorizedError, "Invalid authorization credentials."], + ["403 error", 403, MockResponse(403), client.MixpanelForbiddenError, "User does not have permission to access the resource."], + ["404 error", 404, MockResponse(404), client.MixpanelNotFoundError, "The resource you have specified cannot be found."], + ["404 error", 404, mock_send_error(), client.MixpanelNotFoundError, "Resource not found error message from API response field 'error'."], + ["404 error", 404, mock_send_message(), client.MixpanelNotFoundError, "Resource not found error message from API response field 'message'."], + ["429 error", 429, MockResponse(429), client.Server429Error, "The API rate limit for your organization/application pairing has been exceeded."], + ["500 error", 500, MockResponse(500), client.MixpanelInternalServiceError, "Server encountered an unexpected condition that prevented it from fulfilling the request."], + ["501 error", 501, MockResponse(501), client.MixpanelError, "Unknown Error"], + ]) @mock.patch("requests.Session.get") - def test_check_access_with_handling_for_400_timeout_error_handling(self, mock_request, mock_sleep): - """ - Test that `check_access` method handle 404 error with timeout error message in case of `error` field in response - """ - error = {"request": "/api/2.0/engage/revenue?from_date=2020-02-01&to_date=2020-03-01", "error": "Timeout Error."} - mock_request.return_value = Mockresponse("", 400, raise_error=True, text=error) - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.check_access() - except client.MixpanelBadRequestError as e: - expected_error_message = "HTTP-error-code: 400, Error: Timeout Error.(Please verify your credentials.)" - # Verifying the message formed for the timeout error - self.assertEqual(str(e), expected_error_message) - - @mock.patch("requests.Session.request", side_effect=mock_send_401) - def test_check_access_with_handling_for_401_exception_handling(self, mock_send_401, mock_sleep): - """ - Test that `check_access` method handle 401 error with proper message - """ - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) + def test_check_access_exception_handling( + self, test_name, error_code, mock_response, error, error_message, mock_request, mock_sleep, + ): + """ + Test that `check_access` method handle error with 
proper message. + """ + mock_request.return_value = mock_response + mock_client = client.MixpanelClient( + api_secret="mock_api_secret", + api_domain="mock_api_domain", + request_timeout=REQUEST_TIMEOUT, + ) + with self.assertRaises(error) as e: mock_client.check_access() - except client.MixpanelUnauthorizedError as e: - expected_error_message = "HTTP-error-code: 401, Error: Invalid authorization credentials." - # Verifying the message formed for the custom exception - self.assertEqual(str(e), expected_error_message) - @mock.patch("requests.Session.request", side_effect=mock_send_403) - def test_check_access_with_handling_for_403_exception_handling(self, mock_send_403, mock_sleep): - """ - Test that `check_access` method handle 403 error with proper message - """ - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.check_access() - except client.MixpanelForbiddenError as e: - expected_error_message = "HTTP-error-code: 403, Error: User does not have permission to access the resource." - # Verifying the message formed for the custom exception - self.assertEqual(str(e), expected_error_message) + expected_error_message = f"HTTP-error-code: {error_code}, Error: {error_message}" - @mock.patch("requests.Session.request", side_effect=mock_send_404) - def test_check_access_with_handling_for_404_exception_handling(self, mock_send_404, mock_sleep): - """ - Test that `check_access` method handle 404 error with proper message - """ - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.check_access() - except client.MixpanelNotFoundError as e: - expected_error_message = "HTTP-error-code: 404, Error: The resource you have specified cannot be found." - # Verifying the message formed for the custom exception - self.assertEqual(str(e), expected_error_message) + # Verifying the message formed for the custom exception + self.assertEqual(str(e.exception), expected_error_message) - @mock.patch("requests.Session.request", side_effect=mock_send_429) - def test_check_access_with_handling_for_429_exception_handling(self, mock_send_429, mock_sleep): - """ - Test that `check_access` method handle 429 error with proper message - """ - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.check_access() - except client.Server429Error as e: - expected_error_message = "HTTP-error-code: 429, Error: The API rate limit for your organisation/application pairing has been exceeded." - # Verifying the message formed for the custom exception - self.assertEqual(str(e), expected_error_message) - - @mock.patch("requests.Session.request", side_effect=mock_send_500) - def test_check_access_with_handling_for_500_exception_handling(self, mock_send_500, mock_sleep): - """ - Test that `check_access` method handle 500 error with proper message - """ - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.check_access() - except client.MixpanelInternalServiceError as e: - expected_error_message = "HTTP-error-code: 500, Error: Server encountered an unexpected condition that prevented it from fulfilling the request." 
- # Verifying the message formed for the custom exception - self.assertEqual(str(e), expected_error_message) - - @mock.patch("requests.Session.request", side_effect=mock_send_501) - def test_check_access_with_handling_for_501_exception_handling(self, mock_send_501, mock_sleep): + @parameterized.expand([ + ["500 error", MockResponse(500), client.MixpanelInternalServiceError], + ["501 error", MockResponse(501), client.Server5xxError], + ]) + @mock.patch("requests.Session.request") + def test_request_with_handling_for_5xx_exception_handling( + self, test_name, mock_response, error, mock_request, mock_sleep + ): """ - Test that `check_access` method handle 501 error with proper message + Test that `perform_request` method handle 5xx error with proper message. """ - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.check_access() - except client.MixpanelError as e: - expected_error_message = "HTTP-error-code: 501, Error: Unknown Error" - # Verifying the message formed for the custom exception - self.assertEqual(str(e), expected_error_message) - - - @mock.patch("requests.Session.request", side_effect=mock_send_error) - def test_check_access_with_handling_for_404_exception_handling_error(self, mock_send_error, mock_sleep): - ''' - Verify that if 'error' field is present in API response then it should be used as error message. - ''' - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.check_access() - except client.MixpanelNotFoundError as e: - expected_error_message = "HTTP-error-code: 404, Error: Resource not found error message from API response field 'error'." - # Verifying the message retrived from 'error' field of API response - self.assertEqual(str(e), expected_error_message) - - @mock.patch("requests.Session.request", side_effect=mock_send_message) - def test_check_access_with_handling_for_404_exception_handling_message(self, mock_send_message, mock_sleep): - ''' - Verify that if 'message' field is present in API response then it should be used as error message. - ''' - try: - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) - mock_client.check_access() - except client.MixpanelNotFoundError as e: - expected_error_message = "HTTP-error-code: 404, Error: Resource not found error message from API response field 'message'." - # Verifying the message retrived from 'message' field of API response - self.assertEqual(str(e), expected_error_message) + mock_request.return_value = mock_response + mock_client = client.MixpanelClient( + api_secret="mock_api_secret", + api_domain="mock_api_domain", + request_timeout=REQUEST_TIMEOUT, + ) + with self.assertRaises(error): + mock_client.perform_request("GET") @mock.patch("requests.Session.request", side_effect=requests.exceptions.Timeout) def test_check_access_handle_timeout_error(self, mock_request, mock_sleep): - ''' - Check whether the request backoffs properly for `check_access` method for 5 times in case of Timeout error. - ''' - mock_client = client.MixpanelClient(api_secret="mock_api_secret", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT) + """ + Check whether the request back off properly for `check_access` + method for 5 times in case of Timeout error. 
+        """
+        mock_client = client.MixpanelClient(
+            api_secret="mock_api_secret",
+            api_domain="mock_api_domain",
+            request_timeout=REQUEST_TIMEOUT,
+        )
         with self.assertRaises(client.ReadTimeoutError):
             mock_client.check_access()
-
+
         # Verify that requests.Session.request is called 5 times
-        self.assertEqual(mock_request.call_count, 5)
\ No newline at end of file
+        self.assertEqual(mock_request.call_count, 5)
diff --git a/tests/unittests/test_medium_client.py b/tests/unittests/test_medium_client.py
index ddba05b..761aad5 100644
--- a/tests/unittests/test_medium_client.py
+++ b/tests/unittests/test_medium_client.py
@@ -1,118 +1,174 @@
 from collections.abc import Generator
 from unittest import mock
-from unittest.mock import patch
 
-import backoff
 import requests
 import requests_mock
 from pytest import raises
+
 from tap_mixpanel import client
-from tap_mixpanel.client import (ReadTimeoutError, Server5xxError,
-                                 Server429Error, MixpanelInternalServiceError)
+from tap_mixpanel.client import (
+    MixpanelInternalServiceError,
+    ReadTimeoutError,
+    Server5xxError,
+    Server429Error,
+)
 from tests.configuration.fixtures import mixpanel_client
 
 
-@mock.patch('time.sleep', return_value=None)
+@mock.patch("time.sleep", return_value=None)
 def test_request_export_backoff_on_timeout(mock_sleep, mixpanel_client):
-    with requests_mock.Mocker() as m:
-        m.request('GET', 'http://test.com',
-                  exc=requests.exceptions.Timeout('Timeout on request'))
-
-        with raises(ReadTimeoutError) as ex:
-            for record in mixpanel_client.request_export('GET', url='http://test.com'):
+    """
+    Test that the request_export method of the client backs off the max number of times
+    (time.sleep called 'Max-1' times) if a timeout error occurs.
+    """
+    with requests_mock.Mocker() as mocker:
+        mocker.request(
+            "GET",
+            "http://test.com",
+            exc=requests.exceptions.Timeout("Timeout on request"),
+        )
+
+        with raises(ReadTimeoutError):
+            for record in mixpanel_client.request_export("GET", url="http://test.com"):
                 pass
 
+    # Assert backoff retry count as expected
     assert mock_sleep.call_count == client.BACKOFF_MAX_TRIES_REQUEST - 1
 
 
-@mock.patch('time.sleep', return_value=None)
+@mock.patch("time.sleep", return_value=None)
 def test_request_export_backoff_on_remote_timeout(mock_sleep, mixpanel_client):
-    with requests_mock.Mocker() as m:
-        m.request('GET', 'http://test.com', text=None, status_code=504)
-        result = mixpanel_client.request_export('GET', url='http://test.com')
-
-        with raises(Server5xxError) as ex:
-            for record in result:
+    """
+    Test that the request_export method of the client backs off the max number of times
+    (time.sleep called 'Max-1' times) if a 504 error occurs.
+    """
+    with requests_mock.Mocker() as mocker:
+        mocker.request("GET", "http://test.com", text=None, status_code=504)
+
+        with raises(Server5xxError):
+            for _ in mixpanel_client.request_export("GET", url="http://test.com"):
                 pass
 
+    # Assert backoff retry count as expected
     assert mock_sleep.call_count == client.BACKOFF_MAX_TRIES_REQUEST - 1
 
 
-@mock.patch('time.sleep', return_value=None)
+@mock.patch("time.sleep", return_value=None)
 def test_perform_request_backoff_on_remote_timeout_429(mock_sleep, mixpanel_client):
-    with requests_mock.Mocker() as m:
-        m.request('GET', 'http://test.com', text=None,
-                  content=b'error', status_code=429)
-
-        with raises(Server429Error) as ex:
-            result = mixpanel_client.perform_request(
-                'GET', url='http://test.com')
-            for record in result:
-                pass
+    """
+    Test that the perform_request method of the client backs off the max number of times
+    (time.sleep called 'Max-1' times) if a 429 error occurs.
+    """
+    with requests_mock.Mocker() as mocker:
+        mocker.request(
+            "GET", "http://test.com", text=None, content=b"error", status_code=429
+        )
+
+        with raises(Server429Error):
+            mixpanel_client.perform_request("GET", url="http://test.com")
+
+    # Assert backoff retry count as expected
     assert mock_sleep.call_count == client.BACKOFF_MAX_TRIES_REQUEST - 1
 
 
-@mock.patch('time.sleep', return_value=None)
+@mock.patch("time.sleep", return_value=None)
 def test_perform_request_backoff_on_remote_timeout_500(mock_sleep, mixpanel_client):
-    with requests_mock.Mocker() as m:
-        m.request('GET', 'http://test.com', text=None, status_code=500)
+    """
+    Test that the perform_request method of the client backs off the max number of times
+    (time.sleep called 'Max-1' times) if a 500 error occurs.
+    """
+    with requests_mock.Mocker() as mocker:
+        mocker.request("GET", "http://test.com", text=None, status_code=500)
 
-        with raises(MixpanelInternalServiceError) as ex:
-            result = mixpanel_client.perform_request(
-                'GET', url='http://test.com')
+        with raises(MixpanelInternalServiceError):
+            mixpanel_client.perform_request("GET", url="http://test.com")
 
-        for record in result:
-            pass
+    # Assert backoff retry count as expected
     assert mock_sleep.call_count == client.BACKOFF_MAX_TRIES_REQUEST - 1
 
 
-@mock.patch('time.sleep', return_value=None)
+@mock.patch("time.sleep", return_value=None)
 def test_check_access_backoff_on_remote_timeout_429(mock_sleep, mixpanel_client):
-    with requests_mock.Mocker() as m:
-        m.request('GET', 'https://mixpanel.com/api/2.0/engage',
-                  content=b'error', text=None, status_code=429)
+    """
+    Test that the check_access method of the client backs off 5 times (time.sleep called 4 times)
+    if a 429 error occurs.
+    """
+    with requests_mock.Mocker() as mocker:
+        mocker.request(
+            "GET",
+            "https://mixpanel.com/api/2.0/engage",
+            content=b"error",
+            text=None,
+            status_code=429,
+        )
+
+        with raises(Server429Error):
+            mixpanel_client.check_access()
 
-        with raises(Server429Error) as ex:
-            result = mixpanel_client.check_access()
+    # Assert backoff retry count as expected
     assert mock_sleep.call_count == 5 - 1
 
 
-@mock.patch('time.sleep', return_value=None)
+@mock.patch("time.sleep", return_value=None)
 def test_check_access_backoff_on_remote_timeout_500(mock_sleep, mixpanel_client):
-    with requests_mock.Mocker() as m:
-        m.request('GET', 'https://mixpanel.com/api/2.0/engage',
-                  content=b'error', text=None, status_code=500)
+    """
+    Test that the check_access method of the client backs off 5 times (time.sleep called 4 times)
+    if a 500 error occurs.
+    """
+    with requests_mock.Mocker() as mocker:
+        mocker.request(
+            "GET",
+            "https://mixpanel.com/api/2.0/engage",
+            content=b"error",
+            text=None,
+            status_code=500,
+        )
+
+        with raises(MixpanelInternalServiceError):
+            mixpanel_client.check_access()
 
-        with raises(MixpanelInternalServiceError) as ex:
-            result = mixpanel_client.check_access()
+    # Assert backoff retry count as expected
     assert mock_sleep.call_count == 5 - 1
 
 
-@mock.patch('time.sleep', return_value=None)
+@mock.patch("time.sleep", return_value=None)
 def test_request_backoff_on_timeout(mock_sleep, mixpanel_client):
-    with requests_mock.Mocker() as m:
-        m.request('GET', 'http://test.com',
-                  exc=requests.exceptions.Timeout('Timeout on request'))
+    """
+    Test that the `request` method of the client backs off the max number of times on a timeout.
+    """
+    with requests_mock.Mocker() as mocker:
+        mocker.request(
+            "GET",
+            "http://test.com",
+            exc=requests.exceptions.Timeout("Timeout on request"),
+        )
+
+        with raises(ReadTimeoutError):
+            mixpanel_client.request("GET", url="http://test.com")
 
-        with raises(ReadTimeoutError) as ex:
-            result = mixpanel_client.request('GET', url='http://test.com')
+    # Assert backoff retry count as expected
     assert mock_sleep.call_count == client.BACKOFF_MAX_TRIES_REQUEST - 1
 
 
 def test_request_returns_json(mixpanel_client):
-    with requests_mock.Mocker() as m:
-        m.request('GET', 'http://test.com', json={'a': 'b'})
-        result = mixpanel_client.request('GET', url='http://test.com')
-        assert result == {'a': 'b'}
+    """
+    Test that the request method of the client returns a JSON object.
+    """
+    with requests_mock.Mocker() as mocker:
+        mocker.request("GET", "http://test.com", json={"a": "b"})
+        result = mixpanel_client.request("GET", url="http://test.com")
+
+        # Verify that returned object is expected JSON.
+        assert result == {"a": "b"}
 
 
 def test_request_export_returns_generator(mixpanel_client):
-    with requests_mock.Mocker() as m:
-        m.request('GET', 'http://test.com', json={'a': 'b'})
-        result = mixpanel_client.request_export('GET', url='http://test.com')
+    """
+    Test that the request_export method of the client returns a generator object.
+    """
+    with requests_mock.Mocker() as mocker:
+        mocker.request("GET", "http://test.com", json={"a": "b"})
+        result = mixpanel_client.request_export("GET", url="http://test.com")
+
+        # Verify that returned object is a generator object.
         assert isinstance(result, Generator)
diff --git a/tests/unittests/test_request_timeout_param_value.py b/tests/unittests/test_request_timeout_param_value.py
index c6ea20d..e9ca91b 100644
--- a/tests/unittests/test_request_timeout_param_value.py
+++ b/tests/unittests/test_request_timeout_param_value.py
@@ -1,5 +1,7 @@
 import unittest
 from unittest import mock
+from parameterized import parameterized
+
 from tap_mixpanel.__init__ import main
 
 CONFIG = {
@@ -11,113 +13,95 @@
     "start_date": "2020-02-01T00:00:00Z",
     "user_agent": "tap-mixpanel ",
     "eu_residency": False,
-    "end_date": "2020-03-02T00:00:00Z"
+    "end_date": "2020-03-02T00:00:00Z",
 }
-REQUEST_TIMEOUT = 300
-REQUEST_TIMEOUT_FLOAT = 300.0
+REQUEST_TIMEOUT_DEFAULT = 300
+TIMEOUT_FLOAT = 200.0
+TIMEOUT_INT = 200
+NULL_STRING = ""
+ZERO_INT = 0
+ZERO_STRING = "0"
+STRING_INT = "200"
+
+
+class MockParseArgs:
+    """Mocked MockParseArgs class with custom state, discover, config
+    attributes to pass unit test cases."""
 
-class MockParseArgs():
-    """
-    Mocked MockParseArgs class with custom state, discover, config attributes to pass unit test cases.
-    """
     def __init__(self, state, discover, config):
         self.state = state
         self.discover = discover
         self.config = config
 
+
 class Mockresponse:
-    """
-    Mocked standard HTTPResponse.
- """ + """Mocked standard HTTPResponse.""" + def __init__(self, resp, status_code): self.json_data = resp self.status_code = status_code + HEADER = { - 'User-Agent': 'tap-mixpanel ', - 'Accept': 'application/json', - 'Authorization': 'Basic ZHVtbXlfc2VjcmV0' - } - -@mock.patch("requests.Session.request", return_value = Mockresponse("", status_code=200)) + "User-Agent": "tap-mixpanel ", + "Accept": "application/json", + "Authorization": "Basic ZHVtbXlfc2VjcmV0", +} + + +@mock.patch("requests.Session.request", return_value=Mockresponse("", status_code=200)) @mock.patch("singer.utils.parse_args") -@mock.patch("tap_mixpanel.__init__.do_discover", return_value = '') +@mock.patch("tap_mixpanel.__init__.do_discover", return_value="") class TestMixpanelRequestTimeoutParameterValue(unittest.TestCase): - """Test that tap handles different type of request_timeout parameter values""" - def test_request_timeout_for_none_param_value(self, mock_discover, mock_parse_args, mock_request): - """Test that tap handles none value of request_timeout parameter""" + """ + Test that tap handles different type of request_timeout parameter + values. + """ + + def test_request_timeout_for_none_param_value( + self, mock_discover, mock_parse_args, mock_request + ): + """Test that tap handles none value of request_timeout parameter.""" config = CONFIG.copy() - mock_parse_args.return_value = MockParseArgs(state = {}, discover = True, config=config) - r = main() + mock_parse_args.return_value = MockParseArgs( + state={}, discover=True, config=config + ) + main() # Verify that request method called with expected parameter value when"request_timeout" is None - mock_request.assert_called_with('GET','https://mixpanel.com/api/2.0/engage', allow_redirects=True, - headers=HEADER, timeout=REQUEST_TIMEOUT) - - def test_request_timeout_for_empty_param_value(self, mock_discover, mock_parse_args, mock_request): - """Test that tap handles empty value of request_timeout parameter""" - config = CONFIG.copy() - config['request_timeout'] = "" - mock_parse_args.return_value = MockParseArgs(state = {}, discover = True, config=config) - r = main() - - # Verify that request method called with expected parameter value when"request_timeout" is empty string - mock_request.assert_called_with('GET','https://mixpanel.com/api/2.0/engage', allow_redirects=True, - headers=HEADER, timeout=REQUEST_TIMEOUT) - - def test_request_timeout_for_string_param_value(self, mock_discover, mock_parse_args, mock_request): - """Test that tap handles string value of request_timeout parameter""" - config = CONFIG.copy() - config['request_timeout'] = "100" - mock_parse_args.return_value = MockParseArgs(state = {}, discover = True, config=config) - r = main() - - # Verify that request method called with expected parameter value when"request_timeout" is string - mock_request.assert_called_with('GET','https://mixpanel.com/api/2.0/engage', allow_redirects=True, - headers=HEADER, timeout=100.0) - - def test_request_timeout_for_int_param_value(self, mock_discover, mock_parse_args, mock_request): - """Test that tap handles int value of request_timeout parameter""" - config = CONFIG.copy() - config['request_timeout'] = 200 - mock_parse_args.return_value = MockParseArgs(state = {}, discover = True, config=config) - r = main() - - # Verify that request method called with expected parameter value when"request_timeout" is int - mock_request.assert_called_with('GET','https://mixpanel.com/api/2.0/engage', allow_redirects=True, - headers=HEADER, timeout=200.0) - - def 
test_request_timeout_for_float_param_value(self, mock_discover, mock_parse_args, mock_request): - """Test that tap handles float value of request_timeout parameter""" - config = CONFIG.copy() - config['request_timeout'] = REQUEST_TIMEOUT_FLOAT - mock_parse_args.return_value = MockParseArgs(state = {}, discover = True, config=config) - r = main() - - # Verify that request method called with expected parameter value when"request_timeout" is float - mock_request.assert_called_with('GET','https://mixpanel.com/api/2.0/engage', allow_redirects=True, - headers=HEADER, timeout=REQUEST_TIMEOUT_FLOAT) - - def test_request_timeout_for_zero_int_param_value(self, mock_discover, mock_parse_args, mock_request): - """Test that tap handles int 0 value of request_timeout parameter""" - config = CONFIG.copy() - config['request_timeout'] = 0 - mock_parse_args.return_value = MockParseArgs(state = {}, discover = True, config=config) - r = main() - - # Verify that request method called with expected parameter value when"request_timeout" is int 0 - mock_request.assert_called_with('GET','https://mixpanel.com/api/2.0/engage', allow_redirects=True, - headers=HEADER, timeout=REQUEST_TIMEOUT) - - def test_request_timeout_for_zero_string_param_value(self, mock_discover, mock_parse_args, mock_request): - """Test that tap handles string 0 value of request_timeout parameter""" - config = CONFIG.copy() - config['request_timeout'] = "0" - mock_parse_args.return_value = MockParseArgs(state = {}, discover = True, config=config) - r = main() + mock_request.assert_called_with( + "GET", + "https://mixpanel.com/api/2.0/engage", + allow_redirects=True, + headers=HEADER, + timeout=REQUEST_TIMEOUT_DEFAULT, + ) - # Verify that request method called with expected parameter value when"request_timeout" is string 0 - mock_request.assert_called_with('GET','https://mixpanel.com/api/2.0/engage', allow_redirects=True, - headers=HEADER, timeout=REQUEST_TIMEOUT) + @parameterized.expand([ + ["empty value", NULL_STRING, REQUEST_TIMEOUT_DEFAULT], + ["string value", STRING_INT, TIMEOUT_FLOAT], + ["integer value", TIMEOUT_INT, TIMEOUT_FLOAT], + ["float value", TIMEOUT_FLOAT, TIMEOUT_FLOAT], + ["zero value", ZERO_INT, REQUEST_TIMEOUT_DEFAULT], + ["zero(string) value", ZERO_STRING, REQUEST_TIMEOUT_DEFAULT], + ]) + def test_request_timeout( + self, mock_discover, mock_parse_args, mock_request, test_name, input_value, expected_value + ): + """Test that tap handles various request timeout values.""" + config = CONFIG.copy() + config["request_timeout"] = input_value + mock_parse_args.return_value = MockParseArgs( + state={}, discover=True, config=config + ) + main() + # Verify that request method called with expected parameter value + mock_request.assert_called_with( + "GET", + "https://mixpanel.com/api/2.0/engage", + allow_redirects=True, + headers=HEADER, + timeout=expected_value, + ) diff --git a/tests/unittests/test_support_eu_endpoints.py b/tests/unittests/test_support_eu_endpoints.py index 50eba94..3438276 100644 --- a/tests/unittests/test_support_eu_endpoints.py +++ b/tests/unittests/test_support_eu_endpoints.py @@ -1,19 +1,20 @@ import unittest from unittest import mock -from tap_mixpanel.streams import Revenue, Export -from tap_mixpanel.client import MixpanelClient + from tap_mixpanel.__init__ import main +from tap_mixpanel.client import MixpanelClient +from tap_mixpanel.streams import Export, Revenue EU_CONFIG = { - "api_secret": "dummy_secret", - "date_window_size": "30", - "attribution_window": "5", - "project_timezone": "Europe/Amsterdam", - 
"select_properties_by_default": "true", - "start_date": "2020-02-01T00:00:00Z", - "user_agent": "tap-mixpanel ", - "eu_residency": True, - "end_date": "2020-03-02T00:00:00Z" + "api_secret": "dummy_secret", + "date_window_size": "30", + "attribution_window": "5", + "project_timezone": "Europe/Amsterdam", + "select_properties_by_default": "true", + "start_date": "2020-02-01T00:00:00Z", + "user_agent": "tap-mixpanel ", + "eu_residency": True, + "end_date": "2020-03-02T00:00:00Z", } STANDARD_CONFIG = { "api_secret": "dummy_secret", @@ -24,28 +25,43 @@ "start_date": "2020-02-01T00:00:00Z", "user_agent": "tap-mixpanel ", "eu_residency": False, - "end_date": "2020-03-02T00:00:00Z" + "end_date": "2020-03-02T00:00:00Z", } -class MockStream(): + +class MockStream: + """Mock stream object class""" + def __init__(self, stream): self.stream = stream -class MockCatalog(): + +class MockCatalog: + """Mock catalog object class.""" + def __init__(self, name): self.name = name def get_selected_streams(self, state): + """Returns the list of selected stream objects.""" return [MockStream(self.name)] -class MockParseArgs(): + +class MockParseArgs: + """Mock args object class.""" + def __init__(self, state, discover, config): self.state = state self.discover = discover self.config = config -class Mockresponse: - def __init__(self, resp, status_code, content=[""], headers=None, raise_error=False, text={}): + +class MockResponse: + """Mocked standard HTTPResponse to test error handling.""" + + def __init__( + self, resp, status_code, content=[""], headers=None, raise_error=False, text={} + ): self.json_data = resp self.status_code = status_code self.content = content @@ -54,84 +70,166 @@ def __init__(self, resp, status_code, content=[""], headers=None, raise_error=Fa self.text = text self.reason = "error" - def prepare(self): - return (self.json_data, self.status_code, self.content, self.headers, self.raise_error) - def json(self): + """Returns a JSON object of the result.""" return self.text + class TestMixpanelSupportEuEndpoints(unittest.TestCase): + """ + Test that europe domain support is working. + """ @mock.patch("tap_mixpanel.client.MixpanelClient.request") @mock.patch("tap_mixpanel.streams.MixPanel.write_bookmark") @mock.patch("tap_mixpanel.streams.MixPanel.write_schema") - def test_support_eu_endpoints_except_export(self, mock_write_schema, mock_write_bookmark, mock_request): + def test_support_eu_endpoints_except_export( + self, mock_write_schema, mock_write_bookmark, mock_request + ): + """ + Test case for the streams other than export stream that, + For eu_residency europe domain base url is called. + And for eu_residency 'false' in the config, default domain URL is called. 
+ """ mock_request.return_value = {} - mock_write_schema.return_value = '' - mock_write_bookmark.return_value = '' + mock_write_schema.return_value = "" + mock_write_bookmark.return_value = "" state = {} - catalog = MockCatalog('revenue') + catalog = MockCatalog("revenue") - client = MixpanelClient('','','') + client = MixpanelClient("", "", "") revenue_obj = Revenue(client) - revenue_obj.sync(catalog=catalog, state=state, config=EU_CONFIG, start_date="2020-02-01T00:00:00Z") - - mock_request.assert_called_with(method='GET', url='https://eu.mixpanel.com/api/2.0', path='engage/revenue', - params='unit=day&from_date=2020-02-01&to_date=2020-03-02', endpoint='revenue') + revenue_obj.sync( + catalog=catalog, + state=state, + config=EU_CONFIG, + start_date="2020-02-01T00:00:00Z", + ) + + # Verify that with EU config, base url has eu-domain. + mock_request.assert_called_with( + method="GET", + url="https://eu.mixpanel.com/api/2.0", + path="engage/revenue", + params="unit=day&from_date=2020-02-01&to_date=2020-03-02", + endpoint="revenue", + ) revenue_obj = Revenue(client) - revenue_obj.sync(catalog=catalog,state=state, config=STANDARD_CONFIG, start_date="2020-02-01T00:00:00Z") - - mock_request.assert_called_with(method='GET', url='https://mixpanel.com/api/2.0', path='engage/revenue', - params='unit=day&from_date=2020-02-01&to_date=2020-03-02', endpoint='revenue') - + revenue_obj.sync( + catalog=catalog, + state=state, + config=STANDARD_CONFIG, + start_date="2020-02-01T00:00:00Z", + ) + + # Verify that with standard config, base URL has default domain. + mock_request.assert_called_with( + method="GET", + url="https://mixpanel.com/api/2.0", + path="engage/revenue", + params="unit=day&from_date=2020-02-01&to_date=2020-03-02", + endpoint="revenue", + ) @mock.patch("tap_mixpanel.client.MixpanelClient.request_export") @mock.patch("tap_mixpanel.streams.MixPanel.write_bookmark") @mock.patch("tap_mixpanel.streams.MixPanel.write_schema") - def test_support_export_eu_endpoint(self, mock_write_schema, mock_write_bookmark, mock_request_export): + def test_support_export_eu_endpoint( + self, mock_write_schema, mock_write_bookmark, mock_request_export + ): + """ + Test case for the export stream (as it has different base url) that, + For eu_residency europe domain base url is called. + And for eu_residency 'false' in the config, default domain URL is called. + """ mock_request_export.return_value = {} - mock_write_schema.return_value = '' - mock_write_bookmark.return_value = '' + mock_write_schema.return_value = "" + mock_write_bookmark.return_value = "" state = {} - catalog = MockCatalog('export') + catalog = MockCatalog("export") - client = MixpanelClient('','','') + client = MixpanelClient("", "", "") export_obj = Export(client) - export_obj.sync(catalog=catalog,state=state, config=EU_CONFIG, start_date="2020-02-01T00:00:00Z") - - mock_request_export.assert_called_with(method='GET', url='https://data-eu.mixpanel.com/api/2.0', path='export', - params='from_date=2020-02-01&to_date=2020-03-02', endpoint='export') + export_obj.sync( + catalog=catalog, + state=state, + config=EU_CONFIG, + start_date="2020-02-01T00:00:00Z", + ) + + # Verify that with EU config, base url has eu-domain. 
+ mock_request_export.assert_called_with( + method="GET", + url="https://data-eu.mixpanel.com/api/2.0", + path="export", + params="from_date=2020-02-01&to_date=2020-03-02", + endpoint="export", + ) export_obj = Export(client) - export_obj.sync(catalog=catalog,state=state, config=STANDARD_CONFIG, start_date="2020-02-01T00:00:00Z") - - mock_request_export.assert_called_with(method='GET', url='https://data.mixpanel.com/api/2.0', path='export', - params='from_date=2020-02-01&to_date=2020-03-02', endpoint='export') - + export_obj.sync( + catalog=catalog, + state=state, + config=STANDARD_CONFIG, + start_date="2020-02-01T00:00:00Z", + ) + + # Verify that with standard config, base URL has default domain. + mock_request_export.assert_called_with( + method="GET", + url="https://data.mixpanel.com/api/2.0", + path="export", + params="from_date=2020-02-01&to_date=2020-03-02", + endpoint="export", + ) @mock.patch("requests.Session.request") @mock.patch("singer.utils.parse_args") @mock.patch("tap_mixpanel.__init__.do_discover") - def test_support_eu_endpoint_in_discover(self, mock_discover, mock_parse_args, mock_request): - - mock_request.return_value = Mockresponse("", status_code=200) - mock_discover.return_value = '' - mock_parse_args.return_value = MockParseArgs(state = {}, discover = True, config=EU_CONFIG) - r = main() - + def test_support_eu_endpoint_in_discover( + self, mock_discover, mock_parse_args, mock_request + ): + """ + Test case for the discover mode, + For eu_residency europe domain base url is called. + And for eu_residency 'false' in the config, default domain URL is called. + """ + + mock_request.return_value = MockResponse("", status_code=200) + mock_discover.return_value = "" + mock_parse_args.return_value = MockParseArgs( + state={}, discover=True, config=EU_CONFIG + ) header = { - 'User-Agent': 'tap-mixpanel ', - 'Accept': 'application/json', - 'Authorization': 'Basic ZHVtbXlfc2VjcmV0' + "User-Agent": "tap-mixpanel ", + "Accept": "application/json", + "Authorization": "Basic ZHVtbXlfc2VjcmV0", } - mock_request.assert_called_with('GET','https://eu.mixpanel.com/api/2.0/engage', allow_redirects=True, - headers=header, timeout=300) - - mock_parse_args.return_value = MockParseArgs(state = {}, discover = True, config=STANDARD_CONFIG) - r = main() - mock_request.assert_called_with('GET','https://mixpanel.com/api/2.0/engage', allow_redirects=True, - headers=header, timeout=300) + main() + + # Verify that with EU config, base url has eu-domain. + mock_request.assert_called_with( + "GET", + "https://eu.mixpanel.com/api/2.0/engage", + allow_redirects=True, + headers=header, + timeout=300, + ) + + mock_parse_args.return_value = MockParseArgs( + state={}, discover=True, config=STANDARD_CONFIG + ) + main() + + # Verify that with standard config, base URL has default domain. 
+ mock_request.assert_called_with( + "GET", + "https://mixpanel.com/api/2.0/engage", + allow_redirects=True, + headers=header, + timeout=300, + ) diff --git a/tests/unittests/test_transform_event_times.py b/tests/unittests/test_transform_event_times.py index 2fba5d3..37d4108 100644 --- a/tests/unittests/test_transform_event_times.py +++ b/tests/unittests/test_transform_event_times.py @@ -1,35 +1,53 @@ -from tap_mixpanel.transform import transform_event_times -import pytz import unittest from datetime import datetime +import pytz + +from tap_mixpanel.transform import transform_event_times + UTC = pytz.utc class TestTransformEventTimes(unittest.TestCase): + """ + Test that `transform_event_times` function formats, + the Eastern and UTC formatted dates to ISO datetime. + """ def test_utc_now(self): - + """ + Testcase for the UTC timezone is converted to the given format. + """ + input_time = datetime.utcnow() - + record = {"time": input_time.timestamp()} project_timezone = "UTC" - + actual = transform_event_times(record, project_timezone) - expected = {"time": input_time.astimezone(UTC).strftime("%04Y-%m-%dT%H:%M:%S.000000Z")} - + expected = { + "time": input_time.astimezone(UTC).strftime("%04Y-%m-%dT%H:%M:%S.000000Z") + } + + # Verify that record uis converted as expected. self.assertEqual(expected, actual) - - + def test_eastern_time(self): + """ + Testcase for the eastern timezone is converted to given formate. + """ + project_timezone = "US/Eastern" EASTERN = pytz.timezone(project_timezone) # This gives us 2021-08-12T11:00:00-4:00 input_time = EASTERN.localize(datetime(2021, 8, 12, 11, 0, 0)) - + record = {"time": input_time.timestamp()} actual = transform_event_times(record, project_timezone) - - expected = {"time": input_time.astimezone(UTC).strftime("%04Y-%m-%dT%H:%M:%S.000000Z")} - self.assertEqual(expected, actual) \ No newline at end of file + expected = { + "time": input_time.astimezone(UTC).strftime("%04Y-%m-%dT%H:%M:%S.000000Z") + } + + # Verify that record uis converted as expected. 
+ self.assertEqual(expected, actual) From 40d63b1b4b03e483f6926039d974653e50019749 Mon Sep 17 00:00:00 2001 From: NevilParikh14 <92399024+NevilParikh14@users.noreply.github.com> Date: Mon, 3 Oct 2022 15:38:29 +0530 Subject: [PATCH 02/22] updated config.yml --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index d60c08a..fa4f5e9 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -30,7 +30,7 @@ jobs: command: | source /usr/local/share/virtualenvs/tap-mixpanel/bin/activate pip install coverage parameterized - python -m pytest --junitxml=junit/test-result.xml --cov=tap_mixpanel --cov-report=html tests/unittests/ + python -m pytest --junitxml=junit/test-result.xml --cov=tap_mixpanel --cov-report=html tests/unittests/ - store_test_results: path: test_output/report.xml - store_artifacts: From 48b65496d5af9dba2e73ef2accae6d0c4d097736 Mon Sep 17 00:00:00 2001 From: NevilParikh14 <92399024+NevilParikh14@users.noreply.github.com> Date: Mon, 3 Oct 2022 16:38:02 +0530 Subject: [PATCH 03/22] updated config.yml --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index fa4f5e9..74b8317 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -19,6 +19,7 @@ jobs: name: 'pylint tap' command: | source /usr/local/share/virtualenvs/tap-mixpanel/bin/activate + pip install pylint pylint tap_mixpanel -d 'broad-except,chained-comparison,empty-docstring,fixme,invalid-name,line-too-long,missing-module-docstring,no-else-raise,no-else-return,too-few-public-methods,too-many-arguments,too-many-branches,too-many-lines,too-many-locals,ungrouped-imports,too-many-public-methods,protected-access,too-many-statements,not-an-iterable' - run: name: 'JSON Validator' From 485e80407d9eb6aabc136e2b0eba92223c693267 Mon Sep 17 00:00:00 2001 From: NevilParikh14 <92399024+NevilParikh14@users.noreply.github.com> Date: Mon, 3 Oct 2022 16:42:04 +0530 Subject: [PATCH 04/22] updated base.py --- tests/tap_tester/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/tap_tester/base.py b/tests/tap_tester/base.py index b3e24c0..0cf4558 100644 --- a/tests/tap_tester/base.py +++ b/tests/tap_tester/base.py @@ -8,7 +8,8 @@ import dateutil.parser import pytz -from tap_tester import LOGGER, BaseCase, connections, menagerie, runner +from tap_tester import LOGGER, connections, menagerie, runner +from tap_tester.base_case import BaseCase class TestMixPanelBase(BaseCase): From 9ce1cd57dcc9e1247f9ee8a506c9b65536941878 Mon Sep 17 00:00:00 2001 From: NevilParikh14 Date: Tue, 4 Oct 2022 16:49:17 +0530 Subject: [PATCH 05/22] Reverted all_fields and pagination test --- tests/tap_tester/test_mixpanel_all_fields.py | 125 -------------- .../test_mixpanel_all_fields_pagination.py | 154 ++++++++++++++++++ tests/tap_tester/test_mixpanel_pagination.py | 107 ------------ 3 files changed, 154 insertions(+), 232 deletions(-) delete mode 100644 tests/tap_tester/test_mixpanel_all_fields.py create mode 100644 tests/tap_tester/test_mixpanel_all_fields_pagination.py delete mode 100644 tests/tap_tester/test_mixpanel_pagination.py diff --git a/tests/tap_tester/test_mixpanel_all_fields.py b/tests/tap_tester/test_mixpanel_all_fields.py deleted file mode 100644 index 84f0ee2..0000000 --- a/tests/tap_tester/test_mixpanel_all_fields.py +++ /dev/null @@ -1,125 +0,0 @@ -from tap_tester import runner, connections, menagerie - -from base import TestMixPanelBase - - -class 
MixPanelAllFieldsTest(TestMixPanelBase): - - @staticmethod - def name(): - return "tap_tester_mixpanel_all_fields_test" - - def all_fields_test(self): - """ - All Fields Test. - - • Verify that when all fields are selected more than the automatic fields are replicated. - • Verify no unexpected streams were replicated - • Verify that more than just the automatic fields are replicated for each stream. - • Verify all fields for each stream are replicated - • Verify that the automatic fields are sent to the target - """ - - # Only following below 2 streams support pagination - expected_streams = self.expected_streams() - - expected_automatic_fields = self.expected_automatic_fields() - conn_id = connections.ensure_connection(self) - - found_catalogs = self.run_and_verify_check_mode(conn_id) - - # Table and field selection - test_catalogs_all_fields = [ - catalog - for catalog in found_catalogs - if catalog.get("tap_stream_id") in expected_streams - ] - - self.perform_and_verify_table_and_field_selection( - conn_id, test_catalogs_all_fields - ) - - # Grab metadata after performing table-and-field selection to set expectations - # used for asserting all fields are replicated - stream_to_all_catalog_fields = dict() - for catalog in test_catalogs_all_fields: - stream_id, stream_name = catalog["stream_id"], catalog["stream_name"] - catalog_entry = menagerie.get_annotated_schema(conn_id, stream_id) - fields_from_field_level_md = [ - md_entry["breadcrumb"][1] - for md_entry in catalog_entry["metadata"] - if md_entry["breadcrumb"] != [] - ] - stream_to_all_catalog_fields[stream_name] = set(fields_from_field_level_md) - - self.run_and_verify_sync(conn_id) - - actual_fields_by_stream = runner.examine_target_output_for_fields() - - synced_records = runner.get_records_from_target_output() - - # Verify no unexpected streams were replicated - synced_stream_names = set(synced_records.keys()) - self.assertSetEqual(expected_streams, synced_stream_names) - - # All Fields Test - for stream in expected_streams: - with self.subTest(logging="Primary Functional Test", stream=stream): - - # Expected values - expected_all_keys = stream_to_all_catalog_fields[stream] - expected_automatic_keys = expected_automatic_fields.get(stream, set()) - - # Collect actual values - messages = synced_records.get(stream) - actual_all_keys = set() - for message in messages["messages"]: - if message["action"] == "upsert": - actual_all_keys.update(set(message["data"].keys())) - - # Verify that the automatic fields are sent to the target - self.assertTrue( - actual_fields_by_stream.get(stream, set()).issuperset( - expected_automatic_keys - ), - msg="The fields sent to the target don't include all automatic fields", - ) - - # Verify that more than just the automatic fields are replicated for each stream. - # 'cohort_members' has just 2 key and both are automatic - if stream != "cohort_members": - self.assertGreater( - len(expected_all_keys), len(expected_automatic_keys) - ) - - self.assertTrue( - expected_automatic_keys.issubset(expected_all_keys), - msg=f'{expected_automatic_keys-expected_all_keys} is not in "expected_all_keys"', - ) - - # As we can't find the below fields in the docs and also - # it won't be generated by mixpanel APIs now so expected. - if stream == "export": - expected_all_keys = expected_all_keys - { - "labels", - "sampling_factor", - "dataset", - "mp_reserved_duration_s", - "mp_reserved_origin_end", - "mp_reserved_origin_start", - "mp_reserved_event_count", - } - - # Verify all fields for each stream are replicated. 
- # Skip engage as it return records in random manner with dynamic fields. - if not stream == "engage": - self.assertSetEqual(expected_all_keys, actual_all_keys) - - def test_run(self): - # Pagination test for standard server - self.eu_residency = False - self.all_fields_test() - - # Pagination test for EU residency server - self.eu_residency = True - self.all_fields_test() diff --git a/tests/tap_tester/test_mixpanel_all_fields_pagination.py b/tests/tap_tester/test_mixpanel_all_fields_pagination.py new file mode 100644 index 0000000..c4e8034 --- /dev/null +++ b/tests/tap_tester/test_mixpanel_all_fields_pagination.py @@ -0,0 +1,154 @@ +from math import ceil + +from tap_tester import connections, menagerie, runner + +from base import TestMixPanelBase + + +class MixPanelPaginationAllFieldsTest(TestMixPanelBase): + + @staticmethod + def name(): + return "tap_tester_mixpanel_pagination_all_fields_test" + + def pagination_test_run(self): + """ + All Fields Test + • Verify that when all fields are selected more than the automatic fields are replicated. + • Verify no unexpected streams were replicated + • Verify that more than just the automatic fields are replicated for each stream. + • Verify all fields for each stream are replicated + • Verify that the automatic fields are sent to the target + Pagination Test + • Verify that for each stream you can get multiple pages of data + • Verify no duplicate pages are replicated + • Verify no unexpected streams were replicated + PREREQUISITE + For EACH stream add enough data that you surpass the limit of a single + fetch of data. For instance if you have a limit of 250 records ensure + that 251 (or more) records have been posted for that stream. + """ + + # Only following below 2 streams support pagination + streams_to_test_all_fields = self.expected_streams() + streams_to_test_pagination = {'engage', 'cohort_members'} + + expected_automatic_fields = self.expected_automatic_fields() + conn_id = connections.ensure_connection(self) + + found_catalogs = self.run_and_verify_check_mode(conn_id) + + # Table and field selection + test_catalogs_all_fields = [catalog for catalog in found_catalogs + if catalog.get('tap_stream_id') in streams_to_test_all_fields] + + self.perform_and_verify_table_and_field_selection( + conn_id, test_catalogs_all_fields) + + # Grab metadata after performing table-and-field selection to set expectations + # used for asserting all fields are replicated + stream_to_all_catalog_fields = dict() + for catalog in test_catalogs_all_fields: + stream_id, stream_name = catalog['stream_id'], catalog['stream_name'] + catalog_entry = menagerie.get_annotated_schema(conn_id, stream_id) + fields_from_field_level_md = [md_entry['breadcrumb'][1] + for md_entry in catalog_entry['metadata'] + if md_entry['breadcrumb'] != []] + stream_to_all_catalog_fields[stream_name] = set(fields_from_field_level_md) + + record_count_by_stream = self.run_and_verify_sync(conn_id) + + actual_fields_by_stream = runner.examine_target_output_for_fields() + + synced_records = runner.get_records_from_target_output() + + # Verify no unexpected streams were replicated + synced_stream_names = set(synced_records.keys()) + self.assertSetEqual(streams_to_test_all_fields, synced_stream_names) + + # All Fields Test + for stream in streams_to_test_all_fields: + with self.subTest(logging="Primary Functional Test", stream=stream): + + # Expected values + expected_all_keys = stream_to_all_catalog_fields[stream] + expected_automatic_keys = expected_automatic_fields.get(stream, set()) + + # 
Collect actual values + messages = synced_records.get(stream) + actual_all_keys = set() + for message in messages['messages']: + if message['action'] == 'upsert': + actual_all_keys.update(set(message['data'].keys())) + + # Verify that the automatic fields are sent to the target + self.assertTrue( + actual_fields_by_stream.get(stream, set()).issuperset( + expected_automatic_keys), + msg="The fields sent to the target don't include all automatic fields") + + # Verify that more than just the automatic fields are replicated for each stream. + if stream != "cohort_members": # cohort_member has just 2 key and both are automatic + self.assertGreater(len(expected_all_keys), + len(expected_automatic_keys)) + + self.assertTrue(expected_automatic_keys.issubset( + expected_all_keys), msg=f'{expected_automatic_keys-expected_all_keys} is not in "expected_all_keys"') + + # As we can't find the below fields in the docs and also + # it won't be generated by mixpanel APIs now so expected. + if stream == "export": + expected_all_keys = expected_all_keys - {'labels', 'sampling_factor', 'dataset', 'mp_reserved_duration_s', 'mp_reserved_origin_end', + 'mp_reserved_origin_start', 'mp_reserved_event_count'} + + # Verify all fields for each stream are replicated + # Skip engage as it return records in random manner with dynamic fields. + if not stream == "engage": + self.assertSetEqual(expected_all_keys, actual_all_keys) + + # Pagination Test + for stream in streams_to_test_pagination: + with self.subTest(stream=stream): + + # Expected values + expected_primary_keys = self.expected_pks()[stream] + + # Collect actual values + messages = synced_records.get(stream) + primary_keys_list = [tuple([message['data'][expected_pk] for expected_pk in expected_primary_keys]) + for message in messages['messages'] if message['action'] == 'upsert'] + + # Verify that we can paginate with all fields selected + record_count_sync = record_count_by_stream.get(stream, 0) + self.assertGreater(record_count_sync, self.API_LIMIT, + msg="The number of records is not over the stream max limit") + + # Chunk the replicated records (just primary keys) into expected pages + pages = [] + page_count = ceil(len(primary_keys_list) / self.API_LIMIT) + page_size = self.API_LIMIT + for page_index in range(page_count): + page_start = page_index * page_size + page_end = (page_index + 1) * page_size + pages.append(set(primary_keys_list[page_start:page_end])) + + # Verify by primary keys that data is unique for each page + for current_index, current_page in enumerate(pages): + with self.subTest(current_page_primary_keys=current_page): + + for other_index, other_page in enumerate(pages): + if current_index == other_index: + continue # don't compare the page to itself + + self.assertTrue( + current_page.isdisjoint(other_page), msg=f'other_page_primary_keys={other_page}' + ) + + def test_run(self): + # Pagination test for standard server + self.eu_residency = False + self.pagination_test_run() + + # Pagination test for EU residency server + self.eu_residency = True + self.pagination_test_run() diff --git a/tests/tap_tester/test_mixpanel_pagination.py b/tests/tap_tester/test_mixpanel_pagination.py deleted file mode 100644 index 432ca19..0000000 --- a/tests/tap_tester/test_mixpanel_pagination.py +++ /dev/null @@ -1,107 +0,0 @@ -from math import ceil - -from tap_tester import connections, runner - -from base import TestMixPanelBase - - -class MixPanelPaginationTest(TestMixPanelBase): - - @staticmethod - def name(): - return "tap_tester_mixpanel_pagination_test" 
- - def pagination_test_run(self): - """ - Pagination Test - • Verify that for each stream you can get multiple pages of data - • Verify no duplicate pages are replicated - • Verify no unexpected streams were replicated - - PREREQUISITE - For EACH stream add enough data that you surpass the limit of a single - fetch of data. For instance if you have a limit of 250 records ensure - that 251 (or more) records have been posted for that stream. - """ - - # Only following below 2 streams support pagination - expected_streams = {"engage", "cohort_members"} - - conn_id = connections.ensure_connection(self) - - found_catalogs = self.run_and_verify_check_mode(conn_id) - - # Table and field selection - test_catalogs_all_fields = [ - catalog - for catalog in found_catalogs - if catalog.get("tap_stream_id") in expected_streams - ] - - self.perform_and_verify_table_and_field_selection( - conn_id, test_catalogs_all_fields - ) - - record_count_by_stream = self.run_and_verify_sync(conn_id) - - synced_records = runner.get_records_from_target_output() - - # Verify no unexpected streams were replicated - synced_stream_names = set(synced_records.keys()) - self.assertSetEqual(expected_streams, synced_stream_names) - - for stream in expected_streams: - with self.subTest(stream=stream): - - # Expected values - expected_primary_keys = self.expected_pks()[stream] - - # Collect actual values - messages = synced_records.get(stream) - primary_keys_list = [ - tuple( - message["data"][expected_pk] - for expected_pk in expected_primary_keys - ) - for message in messages["messages"] - if message["action"] == "upsert" - ] - - # Verify that we can paginate with all fields selected - record_count_sync = record_count_by_stream.get(stream, 0) - self.assertGreater( - record_count_sync, - self.API_LIMIT, - msg="The number of records is not over the stream max limit", - ) - - # Chunk the replicated records (just primary keys) into expected pages - pages = [] - page_count = ceil(len(primary_keys_list) / self.API_LIMIT) - page_size = self.API_LIMIT - for page_index in range(page_count): - page_start = page_index * page_size - page_end = (page_index + 1) * page_size - pages.append(set(primary_keys_list[page_start:page_end])) - - # Verify by primary keys that data is unique for each page - for current_index, current_page in enumerate(pages): - with self.subTest(current_page_primary_keys=current_page): - - for other_index, other_page in enumerate(pages): - if current_index == other_index: - continue # Don't compare the page to itself - - self.assertTrue( - current_page.isdisjoint(other_page), - msg=f"other_page_primary_keys={other_page}", - ) - - def test_run(self): - # Pagination test for standard server - self.eu_residency = False - self.pagination_test_run() - - # Pagination test for EU residency server - self.eu_residency = True - self.pagination_test_run() From 6d374aed515dcbbd6154d67120314b2450ce6823 Mon Sep 17 00:00:00 2001 From: prijendev Date: Fri, 7 Oct 2022 10:43:48 +0530 Subject: [PATCH 06/22] Provided support of service account authentication --- tap_mixpanel/__init__.py | 6 +- tap_mixpanel/client.py | 49 +++++++++++++-- tests/configuration/fixtures.py | 2 +- tests/tap_tester/base.py | 9 +++ .../test_mixpanel_all_fields_pagination.py | 5 ++ tests/unittests/test_error_handling.py | 13 +++- .../test_request_timeout_param_value.py | 2 + .../test_service_account_authentication.py | 61 +++++++++++++++++++ tests/unittests/test_support_eu_endpoints.py | 6 +- 9 files changed, 141 insertions(+), 12 deletions(-) create mode 100644 
tests/unittests/test_service_account_authentication.py diff --git a/tap_mixpanel/__init__.py b/tap_mixpanel/__init__.py index 00c5e40..ed677a1 100644 --- a/tap_mixpanel/__init__.py +++ b/tap_mixpanel/__init__.py @@ -17,7 +17,6 @@ REQUEST_TIMEOUT = 300 REQUIRED_CONFIG_KEYS = [ "project_timezone", - "api_secret", "attribution_window", "start_date", "user_agent", @@ -72,7 +71,10 @@ def main(): api_domain = "mixpanel.com" with MixpanelClient( - parsed_args.config["api_secret"], + parsed_args.config.get("api_secret"), + parsed_args.config.get("service_account_username"), + parsed_args.config.get("service_account_secret"), + parsed_args.config.get("project_id"), api_domain, request_timeout, parsed_args.config["user_agent"], diff --git a/tap_mixpanel/client.py b/tap_mixpanel/client.py index 5bc97dc..c8660a5 100644 --- a/tap_mixpanel/client.py +++ b/tap_mixpanel/client.py @@ -134,16 +134,32 @@ class MixpanelClient: """ The client class used for making REST calls to the Mixpanel API. """ - def __init__(self, api_secret, api_domain, request_timeout, user_agent=None): + def __init__(self, api_secret, service_account_username, service_account_secret, project_id, api_domain, + request_timeout, user_agent=None): self.__api_secret = api_secret + self.__service_account_username = service_account_username + self.__service_account_secret = service_account_secret + self.__project_id = project_id self.__api_domain = api_domain self.__request_timeout = request_timeout self.__user_agent = user_agent self.__session = requests.Session() self.__verified = False + self.auth_header = None self.disable_engage_endpoint = False def __enter__(self): + """ + Set auth_header with provided credentials. If credentials is not provided, then raise the exception. + """ + if self.__api_secret: + self.auth_header = f"Basic {str(base64.urlsafe_b64encode(self.__api_secret.encode('utf-8')), 'utf-8')}" + elif self.__service_account_username and self.__service_account_secret: + service_account_auth = f"{self.__service_account_username}:{self.__service_account_secret}" + self.auth_header = f"Basic {str(base64.urlsafe_b64encode(service_account_auth.encode('utf-8')), 'utf-8')}" + else: + raise Exception("Error: Missing api_secret or service account username/secret in tap config.json") + self.__verified = self.check_access() return self @@ -167,9 +183,8 @@ def check_access(self): bool: Returns true if credentials are verified. 
(else raises Exception) """ - if self.__api_secret is None: - raise Exception("Error: Missing api_secret in tap config.json.") headers = {} + params = {} # Endpoint: simple API call to return a single record (org settings) to test access url = f"https://{self.__api_domain}/api/2.0/engage" if self.__user_agent: @@ -177,14 +192,24 @@ def check_access(self): headers["Accept"] = "application/json" headers[ "Authorization" - ] = f"Basic {str(base64.urlsafe_b64encode(self.__api_secret.encode('utf-8')), 'utf-8')}" + ] = self.auth_header + if self.__project_id: + params["project_id"] = self.__project_id try: response = self.__session.get( url=url, + params=params, timeout=self.__request_timeout, # Request timeout parameter headers=headers, ) + + if response.status_code == 403: + LOGGER.error( + "HTTP-error-code: 403, Error: User is not a member of this project: %s or this project is invalid", + self.__project_id) + raise MixpanelForbiddenError from None + except requests.exceptions.Timeout as err: LOGGER.error("TIMEOUT ERROR: %s", str(err)) raise ReadTimeoutError from None @@ -288,9 +313,15 @@ def request(self, method, url=None, path=None, params=None, json=None, **kwargs) if method == "POST": kwargs["headers"]["Content-Type"] = "application/json" + if self.__project_id: + if isinstance(params, dict): + params['project_id'] = self.__project_id + else: + params = f"{params}&project_id={self.__project_id}" + kwargs["headers"][ "Authorization" - ] = f"Basic {str(base64.urlsafe_b64encode(self.__api_secret.encode('utf-8')), 'utf-8')}" + ] = self.auth_header with metrics.http_request_timer(endpoint) as timer: response = self.perform_request( method=method, url=url, params=params, json=json, **kwargs @@ -330,6 +361,12 @@ def request_export( else: endpoint = "export" + if self.__project_id: + if isinstance(params, dict): + params['project_id'] = self.__project_id + else: + params = f"{params}&project_id={self.__project_id}" + if "headers" not in kwargs: kwargs["headers"] = {} @@ -343,7 +380,7 @@ def request_export( kwargs["headers"][ "Authorization" - ] = f"Basic {str(base64.urlsafe_b64encode(self.__api_secret.encode('utf-8')), 'utf-8')}" + ] = self.auth_header with metrics.http_request_timer(endpoint) as timer: response = self.perform_request( method=method, url=url, params=params, json=json, stream=True, **kwargs diff --git a/tests/configuration/fixtures.py b/tests/configuration/fixtures.py index 869f533..a7cf8b4 100644 --- a/tests/configuration/fixtures.py +++ b/tests/configuration/fixtures.py @@ -8,7 +8,7 @@ def mixpanel_client(): # Support of request_timeout have been added. 
# So, now MixpanelClient accept request_timeout parameter which is mandatory mixpanel_client = MixpanelClient( - "API_SECRET", api_domain="mixpanel.com", request_timeout=1 + "API_SECRET", "username", "secret", "project_id", api_domain="mixpanel.com", request_timeout=1 ) # Pass extra request_timeout parameter mixpanel_client._MixpanelClient__verified = True return mixpanel_client diff --git a/tests/tap_tester/base.py b/tests/tap_tester/base.py index 0cf4558..c130975 100644 --- a/tests/tap_tester/base.py +++ b/tests/tap_tester/base.py @@ -29,6 +29,7 @@ class TestMixPanelBase(BaseCase): start_date = "" end_date = "" eu_residency = True + service_account_authentication = False def tap_name(self): """The name of the tap.""" @@ -81,6 +82,10 @@ def setUp(self): missing_envs = [] if self.eu_residency: creds = {"api_secret": "TAP_MIXPANEL_EU_RESIDENCY_API_SECRET"} + elif self.service_account_authentication: + creds = {"service_account_username": "TAP_MIXPANEL_SERVICE_ACCOUNT_USERNAME", + "service_account_secret": "TAP_MIXPANEL_SERVICE_ACCOUNT_SECRET", + "project_id": "TAP_MIXPANEL_SERVICE_ACCOUNT_PROJECT_ID"} else: creds = {"api_secret": "TAP_MIXPANEL_API_SECRET"} @@ -132,6 +137,10 @@ def get_credentials(self): credentials_dict = {} if self.eu_residency: creds = {"api_secret": "TAP_MIXPANEL_EU_RESIDENCY_API_SECRET"} + elif self.service_account_authentication: + creds = {"service_account_username": "TAP_MIXPANEL_SERVICE_ACCOUNT_USERNAME", + "service_account_secret": "TAP_MIXPANEL_SERVICE_ACCOUNT_SECRET", + "project_id": "TAP_MIXPANEL_SERVICE_ACCOUNT_PROJECT_ID"} else: creds = {"api_secret": "TAP_MIXPANEL_API_SECRET"} diff --git a/tests/tap_tester/test_mixpanel_all_fields_pagination.py b/tests/tap_tester/test_mixpanel_all_fields_pagination.py index c4e8034..fb54f26 100644 --- a/tests/tap_tester/test_mixpanel_all_fields_pagination.py +++ b/tests/tap_tester/test_mixpanel_all_fields_pagination.py @@ -149,6 +149,11 @@ def test_run(self): self.eu_residency = False self.pagination_test_run() + # Pagination test with service account credentials + self.service_account_authentication = True + self.pagination_test_run() + self.service_account_authentication = False + # Pagination test for EU residency server self.eu_residency = True self.pagination_test_run() diff --git a/tests/unittests/test_error_handling.py b/tests/unittests/test_error_handling.py index d4fc94e..a23a0e8 100644 --- a/tests/unittests/test_error_handling.py +++ b/tests/unittests/test_error_handling.py @@ -121,6 +121,9 @@ def test_perform_request_exception_handling( mock_request.return_value = mock_response mock_client = client.MixpanelClient( api_secret="mock_api_secret", + service_account_username="mock_service_account_username", + service_account_secret="service_account_secret", + project_id="project_id", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT, ) @@ -137,7 +140,6 @@ def test_perform_request_exception_handling( ["400 different timezone error", 400, mock_400_different_timezone(), client.MixpanelBadRequestError, "A validation exception has occurred. 
Please validate the timezone with the MixPanel UI under project settings."], ["400 timeout error", 400, MockResponse(400, text=timeout_400_error), client.MixpanelBadRequestError, "Timeout Error.(Please verify your credentials.)"], ["401 error", 401, MockResponse(401), client.MixpanelUnauthorizedError, "Invalid authorization credentials."], - ["403 error", 403, MockResponse(403), client.MixpanelForbiddenError, "User does not have permission to access the resource."], ["404 error", 404, MockResponse(404), client.MixpanelNotFoundError, "The resource you have specified cannot be found."], ["404 error", 404, mock_send_error(), client.MixpanelNotFoundError, "Resource not found error message from API response field 'error'."], ["404 error", 404, mock_send_message(), client.MixpanelNotFoundError, "Resource not found error message from API response field 'message'."], @@ -155,6 +157,9 @@ def test_check_access_exception_handling( mock_request.return_value = mock_response mock_client = client.MixpanelClient( api_secret="mock_api_secret", + service_account_username="mock_service_account_username", + service_account_secret="service_account_secret", + project_id="project_id", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT, ) @@ -180,6 +185,9 @@ def test_request_with_handling_for_5xx_exception_handling( mock_request.return_value = mock_response mock_client = client.MixpanelClient( api_secret="mock_api_secret", + service_account_username="mock_service_account_username", + service_account_secret="service_account_secret", + project_id="project_id", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT, ) @@ -194,6 +202,9 @@ def test_check_access_handle_timeout_error(self, mock_request, mock_sleep): """ mock_client = client.MixpanelClient( api_secret="mock_api_secret", + service_account_username="mock_service_account_username", + service_account_secret="service_account_secret", + project_id="project_id", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT, ) diff --git a/tests/unittests/test_request_timeout_param_value.py b/tests/unittests/test_request_timeout_param_value.py index e9ca91b..ef1dfd7 100644 --- a/tests/unittests/test_request_timeout_param_value.py +++ b/tests/unittests/test_request_timeout_param_value.py @@ -75,6 +75,7 @@ def test_request_timeout_for_none_param_value( "https://mixpanel.com/api/2.0/engage", allow_redirects=True, headers=HEADER, + params={}, timeout=REQUEST_TIMEOUT_DEFAULT, ) @@ -102,6 +103,7 @@ def test_request_timeout( "GET", "https://mixpanel.com/api/2.0/engage", allow_redirects=True, + params={}, headers=HEADER, timeout=expected_value, ) diff --git a/tests/unittests/test_service_account_authentication.py b/tests/unittests/test_service_account_authentication.py new file mode 100644 index 0000000..660b99d --- /dev/null +++ b/tests/unittests/test_service_account_authentication.py @@ -0,0 +1,61 @@ +import unittest +from unittest import mock +from tests.unittests.test_error_handling import MockResponse +from tap_mixpanel.client import MixpanelClient, MixpanelForbiddenError + +class TestServiceAccountAuthentication(unittest.TestCase): + """ + Test that tap do authentication with service account credentials without any error if it is provided. + """ + + @mock.patch("tap_mixpanel.client.MixpanelClient.check_access") + def test_token_creds(self, mock_check_access): + """Test authentication with token credentials(api_secret). + + Args: + mock_check_access: Mock the check_access method to test authentication. 
+ """ + with MixpanelClient("api_secret", None, None, None,"api_domain", 300) as client_: + pass + + self.assertEqual(client_.auth_header, "Basic YXBpX3NlY3JldA==") + + @mock.patch("tap_mixpanel.client.MixpanelClient.check_access") + def test_service_account_creds(self, mock_check_access): + """Test authentication with service account credentials(username, secret). + + Args: + mock_check_access: Mock the check_access method to test authentication. + """ + with MixpanelClient(None, "service_account_username", "service_account_secret", "project_id","api_domain", 300) as client_: + pass + + self.assertEqual(client_.auth_header, "Basic c2VydmljZV9hY2NvdW50X3VzZXJuYW1lOnNlcnZpY2VfYWNjb3VudF9zZWNyZXQ=") + + @mock.patch("tap_mixpanel.client.MixpanelClient.check_access") + def test_no_creds(self, mock_check_access): + """Test that tap throws an error if credentials is not provided. + + Args: + mock_check_access: Mock the check_access method to test authentication. + """ + with self.assertRaises(Exception) as e: + with MixpanelClient(None, None, None, None,"api_domain", 300) as client_: + pass + + self.assertEqual(str(e.exception), "Error: Missing api_secret or service account username/secret in tap config.json") + + @mock.patch("requests.Session.request", return_value = MockResponse(403)) + @mock.patch("tap_mixpanel.client.LOGGER.error") + def test_check_access_403_error_for_service_account_creds(self, mock_logger, mock_request): + """Test that tap handles 403 error with proper message. + + Args: + mock_logger: Mock of LOGGER to verify the logger message + mock_request: Mock Session.request to explicitly raise the forbidden(403) error. + """ + with self.assertRaises(MixpanelForbiddenError): + with MixpanelClient(None, "service_account_username", "service_account_secret", "project_id","api_domain", 300) as client_: + client_.check_access() + + mock_logger.assert_called_with('HTTP-error-code: 403, Error: User is not a member of this project: %s or this project is invalid', 'project_id') diff --git a/tests/unittests/test_support_eu_endpoints.py b/tests/unittests/test_support_eu_endpoints.py index 3438276..a83204e 100644 --- a/tests/unittests/test_support_eu_endpoints.py +++ b/tests/unittests/test_support_eu_endpoints.py @@ -98,7 +98,7 @@ def test_support_eu_endpoints_except_export( state = {} catalog = MockCatalog("revenue") - client = MixpanelClient("", "", "") + client = MixpanelClient("", "", "", "", "", "") revenue_obj = Revenue(client) revenue_obj.sync( catalog=catalog, @@ -151,7 +151,7 @@ def test_support_export_eu_endpoint( state = {} catalog = MockCatalog("export") - client = MixpanelClient("", "", "") + client = MixpanelClient("", "", "", "", "", "") export_obj = Export(client) export_obj.sync( catalog=catalog, @@ -216,6 +216,7 @@ def test_support_eu_endpoint_in_discover( "GET", "https://eu.mixpanel.com/api/2.0/engage", allow_redirects=True, + params={}, headers=header, timeout=300, ) @@ -231,5 +232,6 @@ def test_support_eu_endpoint_in_discover( "https://mixpanel.com/api/2.0/engage", allow_redirects=True, headers=header, + params={}, timeout=300, ) From fff35437813e07de12fe72c5323008e2bdc320f8 Mon Sep 17 00:00:00 2001 From: prijendev Date: Fri, 7 Oct 2022 11:51:34 +0530 Subject: [PATCH 07/22] Fixed pylint issue. 
--- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 74b8317..5cd004f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -20,7 +20,7 @@ jobs: command: | source /usr/local/share/virtualenvs/tap-mixpanel/bin/activate pip install pylint - pylint tap_mixpanel -d 'broad-except,chained-comparison,empty-docstring,fixme,invalid-name,line-too-long,missing-module-docstring,no-else-raise,no-else-return,too-few-public-methods,too-many-arguments,too-many-branches,too-many-lines,too-many-locals,ungrouped-imports,too-many-public-methods,protected-access,too-many-statements,not-an-iterable' + pylint tap_mixpanel -d 'broad-except,chained-comparison,empty-docstring,fixme,invalid-name,line-too-long,missing-module-docstring,no-else-raise,no-else-return,too-few-public-methods,too-many-arguments,too-many-branches,too-many-lines,too-many-locals,ungrouped-imports,too-many-public-methods,protected-access,too-many-statements,not-an-iterable,too-many-instance-attributes' - run: name: 'JSON Validator' command: | From 7c39ce61cd52525b0c393258d7e9fdb86aa8522a Mon Sep 17 00:00:00 2001 From: prijendev Date: Fri, 7 Oct 2022 12:03:55 +0530 Subject: [PATCH 08/22] Updated Readme. --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index b2f4d6d..d8bcf88 100644 --- a/README.md +++ b/README.md @@ -133,6 +133,9 @@ More details may be found in the [Mixpanel API Authentication](https://developer - `start_date` - the default value to use if no bookmark exists for an endpoint (rfc3339 date string) - `user_agent` (string, optional): Process and email for API logging purposes. Example: `tap-mixpanel ` - `api_secret` (string, `ABCdef123`): an API secret for each project in Mixpanel. This can be found in the Mixpanel Console, upper-right Settings (gear icon), Organization Settings > Projects and in the Access Keys section. For this tap, only the api_secret is needed (the api_key is legacy and the token is used only for uploading data). Each Mixpanel project has a different api_secret; therefore each Singer tap pipeline instance is for a single project. + - `service_account_username` (string, `username12`): Username of the service account. + - `service_account_secret` (string, `ABCdef123`): Secret of the service account. + - `project_id` (string, `10451202`): Id of the project which is connected to the provided service account. - `date_window_size` (integer, `30`): Number of days for date window looping through transactional endpoints with from_date and to_date. Default date_window_size is 30 days. Clients with large volumes of events may want to decrease this to 14, 7, or even down to 1-2 days. - `attribution_window` (integer, `5`): Latency minimum number of days to look-back to account for delays in attributing accurate results. [Default attribution window is 5 days](https://help.mixpanel.com/hc/en-us/articles/115004616486-Tracking-If-Users-Are-Offline). - `project_timezone` (string like `US/Pacific`): Time zone in which integer date times are stored. The project timezone may be found in the project settings in the Mixpanel console. [More info about timezones](https://help.mixpanel.com/hc/en-us/articles/115004547203-Manage-Timezones-for-Projects-in-Mixpanel). 
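To make the service-account settings documented in the README hunk above concrete, below is a minimal sketch — not a file from this repository, and every value is an illustrative placeholder — of what a service-account `config.json` could contain and of the Basic Authorization header the client derives from it, mirroring the `MixpanelClient.__enter__` logic added earlier in this series (PATCH 06).

```python
import base64
import json

# Illustrative service-account tap config; key names follow the README above,
# all values are placeholders (substitute your own project's credentials).
config = {
    "project_timezone": "US/Pacific",
    "attribution_window": 5,
    "start_date": "2020-01-01T00:00:00Z",
    "user_agent": "tap-mixpanel <user@example.com>",
    "service_account_username": "username12",
    "service_account_secret": "ABCdef123",
    "project_id": "10451202",
    "date_window_size": 30,
}

# This is what the config.json file itself would look like.
print(json.dumps(config, indent=2))

# The client joins username and secret with ':' and base64-encodes the result
# for the Basic Authorization header (the api_secret flow encodes the secret alone).
service_account_auth = (
    f"{config['service_account_username']}:{config['service_account_secret']}"
)
auth_header = (
    "Basic "
    + str(base64.urlsafe_b64encode(service_account_auth.encode("utf-8")), "utf-8")
)
print(auth_header)  # -> "Basic <base64 of username:secret>"
```

Later commits in this series add an optional `auth_type` key (`saa` or `api_secret`, defaulting to `api_secret`); with only the changes up to this point, the authentication flow is chosen purely by which credentials are present in the config.
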
From cd74b6a81376712f7fb48bebada18777e657dad7 Mon Sep 17 00:00:00 2001 From: kethan1122 Date: Thu, 15 Jun 2023 10:15:18 +0530 Subject: [PATCH 09/22] fix pylint --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index e45b7bd..38d3008 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -23,7 +23,7 @@ jobs: source /usr/local/share/virtualenvs/tap-mixpanel/bin/activate source dev_env.sh pip install pylint - pylint tap_mixpanel -d "$PYLINT_DISABLE_LIST,too-many-statements,protected-access,redefined-builtin" + pylint tap_mixpanel -d "$PYLINT_DISABLE_LIST,too-many-statements,protected-access,redefined-builtin,too-many-instance-attributes" - run: name: 'JSON Validator' command: | From 52b02dd023e2d41a704a8f09ccdfb38eabea6c1f Mon Sep 17 00:00:00 2001 From: kethan1122 Date: Thu, 15 Jun 2023 10:20:01 +0530 Subject: [PATCH 10/22] generate catalog if not configured --- tap_mixpanel/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tap_mixpanel/__init__.py b/tap_mixpanel/__init__.py index e455fbf..1c66e6c 100644 --- a/tap_mixpanel/__init__.py +++ b/tap_mixpanel/__init__.py @@ -89,11 +89,14 @@ def main(): if parsed_args.discover: do_discover(client, properties_flag) - elif parsed_args.catalog: + else: + catalog = parsed_args.catalog + if not catalog: + catalog = _discover(client, properties_flag) _sync( client=client, config=config, - catalog=parsed_args.catalog, + catalog=catalog, state=state, start_date=start_date, ) From 76fc739b29f41c49e9e56ed99f593fe9c63c85f5 Mon Sep 17 00:00:00 2001 From: kethan1122 Date: Thu, 15 Jun 2023 10:47:05 +0530 Subject: [PATCH 11/22] keep api_domain as attribute for client --- tap_mixpanel/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tap_mixpanel/__init__.py b/tap_mixpanel/__init__.py index 1c66e6c..a6fdfad 100644 --- a/tap_mixpanel/__init__.py +++ b/tap_mixpanel/__init__.py @@ -85,6 +85,7 @@ def main(): state = parsed_args.state config = parsed_args.config + client.__api_domain = api_domain properties_flag = config.get("select_properties_by_default") if parsed_args.discover: From 20c3dd9add0d08fcb8edba14243559be3976090d Mon Sep 17 00:00:00 2001 From: kethan1122 Date: Thu, 15 Jun 2023 11:12:43 +0530 Subject: [PATCH 12/22] remove redundant code in unittests --- tests/unittests/test_transform_event_times.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/unittests/test_transform_event_times.py b/tests/unittests/test_transform_event_times.py index 10531c7..aeef637 100644 --- a/tests/unittests/test_transform_event_times.py +++ b/tests/unittests/test_transform_event_times.py @@ -4,10 +4,6 @@ from datetime import datetime from tap_mixpanel.transform import transform_event_times -import pytz - -from tap_mixpanel.transform import transform_event_times - UTC = pytz.utc From 6629ef2d30faba5863c36b65c7d861aabbf7d469 Mon Sep 17 00:00:00 2001 From: kethan1122 Date: Thu, 15 Jun 2023 12:18:59 +0530 Subject: [PATCH 13/22] add auth_type for auth configuration --- tap_mixpanel/__init__.py | 5 +++++ tap_mixpanel/client.py | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tap_mixpanel/__init__.py b/tap_mixpanel/__init__.py index a6fdfad..e2e1c91 100644 --- a/tap_mixpanel/__init__.py +++ b/tap_mixpanel/__init__.py @@ -70,6 +70,10 @@ def main(): else: api_domain = "mixpanel.com" + auth_type = parsed_args.config.get("auth_type") + if not auth_type: + auth_type = "project_secret" + with MixpanelClient( 
parsed_args.config.get("api_secret"), parsed_args.config.get("service_account_username"), @@ -78,6 +82,7 @@ def main(): api_domain, request_timeout, parsed_args.config["user_agent"], + auth_type ) as client: state = {} diff --git a/tap_mixpanel/client.py b/tap_mixpanel/client.py index 5a9cc03..1e1e837 100644 --- a/tap_mixpanel/client.py +++ b/tap_mixpanel/client.py @@ -136,7 +136,7 @@ class MixpanelClient: The client class used for making REST calls to the Mixpanel API. """ def __init__(self, api_secret, service_account_username, service_account_secret, project_id, api_domain, - request_timeout, user_agent=None): + request_timeout, user_agent=None, auth_type='project_secret'): self.__api_secret = api_secret self.__service_account_username = service_account_username self.__service_account_secret = service_account_secret @@ -145,6 +145,7 @@ def __init__(self, api_secret, service_account_username, service_account_secret, self.__request_timeout = request_timeout self.__user_agent = user_agent self.__session = requests.Session() + self.__auth_type = auth_type self.__verified = False self.auth_header = None self.disable_engage_endpoint = False @@ -153,7 +154,7 @@ def __enter__(self): """ Set auth_header with provided credentials. If credentials is not provided, then raise the exception. """ - if self.__api_secret: + if self.__auth_type == 'project_secret' and self.__api_secret: self.auth_header = f"Basic {str(base64.urlsafe_b64encode(self.__api_secret.encode('utf-8')), 'utf-8')}" elif self.__service_account_username and self.__service_account_secret: service_account_auth = f"{self.__service_account_username}:{self.__service_account_secret}" From eea048041a4e2a5fed2fba13ae86bac70f20cbb6 Mon Sep 17 00:00:00 2001 From: kethan1122 Date: Thu, 15 Jun 2023 14:46:29 +0530 Subject: [PATCH 14/22] merge conflict --- tests/tap_tester/test_mixpanel_bookmark.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/tap_tester/test_mixpanel_bookmark.py b/tests/tap_tester/test_mixpanel_bookmark.py index 032409b..ffec320 100644 --- a/tests/tap_tester/test_mixpanel_bookmark.py +++ b/tests/tap_tester/test_mixpanel_bookmark.py @@ -135,11 +135,7 @@ def bookmark_test_run(self): replication_key_value, first_bookmark_value_utc, msg="First sync bookmark was set incorrectly," -<<<<<<< HEAD "a record with a greater replication-key value was synced.", -======= - " a record with a greater replication-key value was synced.", ->>>>>>> master ) for record in second_sync_messages: From 3b33c20abcb9dbb4db4985f94590116d09b77295 Mon Sep 17 00:00:00 2001 From: kethan1122 Date: Thu, 15 Jun 2023 15:08:07 +0530 Subject: [PATCH 15/22] update unittests --- tests/unittests/test_error_handling.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unittests/test_error_handling.py b/tests/unittests/test_error_handling.py index 2d58efd..603be44 100644 --- a/tests/unittests/test_error_handling.py +++ b/tests/unittests/test_error_handling.py @@ -143,7 +143,6 @@ def test_perform_request_exception_handling( ["400 different timezone error", 400, mock_400_different_timezone(), client.MixpanelBadRequestError, "A validation exception has occurred. 
Please validate the timezone with the MixPanel UI under project settings."], ["400 timeout error", 400, MockResponse(400, text=timeout_400_error), client.MixpanelBadRequestError, "Timeout Error.(Please verify your credentials.)"], ["401 error", 401, MockResponse(401), client.MixpanelUnauthorizedError, "Invalid authorization credentials."], - # ["403 error", 403, MockResponse(403), client.MixpanelForbiddenError, "User is not a member of this project: project_id or this project is invalid"], ["404 error", 404, MockResponse(404), client.MixpanelNotFoundError, "The resource you have specified cannot be found."], ["404 error", 404, mock_send_error(), client.MixpanelNotFoundError, "Resource not found error message from API response field 'error'."], ["404 error", 404, mock_send_message(), client.MixpanelNotFoundError, "Resource not found error message from API response field 'message'."], From dd67021ac890d42bba220d12b9efeccd7ce1efb7 Mon Sep 17 00:00:00 2001 From: kethan1122 Date: Thu, 15 Jun 2023 15:10:05 +0530 Subject: [PATCH 16/22] update unittests --- tests/unittests/test_error_handling.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unittests/test_error_handling.py b/tests/unittests/test_error_handling.py index 603be44..357414a 100644 --- a/tests/unittests/test_error_handling.py +++ b/tests/unittests/test_error_handling.py @@ -165,6 +165,7 @@ def test_check_access_exception_handling( project_id="project_id", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT, + auth_type="saa" ) with self.assertRaises(error) as e: mock_client.check_access() @@ -197,6 +198,7 @@ def test_request_with_handling_for_5xx_exception_handling( project_id="project_id", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT, + auth_type="saa" ) with self.assertRaises(error): mock_client.perform_request("GET") @@ -214,6 +216,7 @@ def test_check_access_handle_timeout_error(self, mock_request, mock_sleep): project_id="project_id", api_domain="mock_api_domain", request_timeout=REQUEST_TIMEOUT, + auth_type="saa" ) with self.assertRaises(client.ReadTimeoutError): mock_client.check_access() From 11fe4b41faa3e20affc1650ac846ee9a4291d612 Mon Sep 17 00:00:00 2001 From: kethan1122 Date: Thu, 15 Jun 2023 19:33:23 +0530 Subject: [PATCH 17/22] change project secret as API secret --- tap_mixpanel/__init__.py | 2 +- tap_mixpanel/client.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tap_mixpanel/__init__.py b/tap_mixpanel/__init__.py index e2e1c91..e353734 100644 --- a/tap_mixpanel/__init__.py +++ b/tap_mixpanel/__init__.py @@ -72,7 +72,7 @@ def main(): auth_type = parsed_args.config.get("auth_type") if not auth_type: - auth_type = "project_secret" + auth_type = "api_secret" with MixpanelClient( parsed_args.config.get("api_secret"), diff --git a/tap_mixpanel/client.py b/tap_mixpanel/client.py index 1e1e837..88f04a8 100644 --- a/tap_mixpanel/client.py +++ b/tap_mixpanel/client.py @@ -154,7 +154,7 @@ def __enter__(self): """ Set auth_header with provided credentials. If credentials is not provided, then raise the exception. 
""" - if self.__auth_type == 'project_secret' and self.__api_secret: + if self.__auth_type == 'api_secret' and self.__api_secret: self.auth_header = f"Basic {str(base64.urlsafe_b64encode(self.__api_secret.encode('utf-8')), 'utf-8')}" elif self.__service_account_username and self.__service_account_secret: service_account_auth = f"{self.__service_account_username}:{self.__service_account_secret}" From a2f49331cc0b3330dd02f9b02cc909950da6337b Mon Sep 17 00:00:00 2001 From: kethan1122 Date: Thu, 15 Jun 2023 20:37:48 +0530 Subject: [PATCH 18/22] change default value --- tap_mixpanel/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_mixpanel/client.py b/tap_mixpanel/client.py index 88f04a8..c747f4a 100644 --- a/tap_mixpanel/client.py +++ b/tap_mixpanel/client.py @@ -136,7 +136,7 @@ class MixpanelClient: The client class used for making REST calls to the Mixpanel API. """ def __init__(self, api_secret, service_account_username, service_account_secret, project_id, api_domain, - request_timeout, user_agent=None, auth_type='project_secret'): + request_timeout, user_agent=None, auth_type='api_secret'): self.__api_secret = api_secret self.__service_account_username = service_account_username self.__service_account_secret = service_account_secret From 7c8a2264116875e41a0312df8203c9c5dbbe4874 Mon Sep 17 00:00:00 2001 From: VishalP <20889199+Vi6hal@users.noreply.github.com> Date: Tue, 4 Jul 2023 23:57:06 +0530 Subject: [PATCH 19/22] added config error and minor enhancement --- tap_mixpanel/__init__.py | 5 +++-- tap_mixpanel/client.py | 35 +++++++++++++---------------------- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/tap_mixpanel/__init__.py b/tap_mixpanel/__init__.py index e353734..9054d47 100644 --- a/tap_mixpanel/__init__.py +++ b/tap_mixpanel/__init__.py @@ -70,8 +70,9 @@ def main(): else: api_domain = "mixpanel.com" - auth_type = parsed_args.config.get("auth_type") - if not auth_type: + auth_type = parsed_args.config.get("auth_type","").lower() + # default to api_secret as authentication_type + if auth_type not in ("saa","api_secret"): auth_type = "api_secret" with MixpanelClient( diff --git a/tap_mixpanel/client.py b/tap_mixpanel/client.py index c747f4a..15f041b 100644 --- a/tap_mixpanel/client.py +++ b/tap_mixpanel/client.py @@ -17,6 +17,8 @@ class ReadTimeoutError(Exception): """Custom error for request timeout.""" +class ConfigurationError(Exception): + """Custom error for incorrect configuration""" class Server5xxError(Exception): """Custom error class for all the 5xx error.""" @@ -156,12 +158,12 @@ def __enter__(self): """ if self.__auth_type == 'api_secret' and self.__api_secret: self.auth_header = f"Basic {str(base64.urlsafe_b64encode(self.__api_secret.encode('utf-8')), 'utf-8')}" - elif self.__service_account_username and self.__service_account_secret: + elif self.__auth_type == 'saa' and self.__service_account_username and self.__service_account_secret: service_account_auth = f"{self.__service_account_username}:{self.__service_account_secret}" self.auth_header = f"Basic {str(base64.urlsafe_b64encode(service_account_auth.encode('utf-8')), 'utf-8')}" else: - raise Exception("Error: Missing api_secret or service account username/secret in tap config.json") - + raise ConfigurationError("Error: Missing api_secret or service account username/secret in tap config.json") + print(self.__auth_type) self.__verified = self.check_access() return self @@ -192,9 +194,7 @@ def check_access(self): if self.__user_agent: headers["User-Agent"] = 
self.__user_agent headers["Accept"] = "application/json" - headers[ - "Authorization" - ] = self.auth_header + headers["Authorization"] = self.auth_header if self.__project_id: params["project_id"] = self.__project_id @@ -321,9 +321,7 @@ def request(self, method, url=None, path=None, params=None, json=None, **kwargs) else: params = f"{params}&project_id={self.__project_id}" - kwargs["headers"][ - "Authorization" - ] = self.auth_header + kwargs["headers"]["Authorization"] = self.auth_header with metrics.http_request_timer(endpoint) as timer: response = self.perform_request( method=method, url=url, params=params, json=json, **kwargs @@ -349,19 +347,15 @@ def request_export( Yields: dict: Records of export stream. """ - if not self.__verified: - self.__verified = self.check_access() + + self.__verified = self.__verified if self.__verified else self.check_access() if url and path: url = f"{url}/{path}" elif path and not url: url = f"https://{self.__api_domain}/api/2.0/{path}" - - if "endpoint" in kwargs: - endpoint = kwargs["endpoint"] - del kwargs["endpoint"] - else: - endpoint = "export" + + endpoint = kwargs.pop("endpoint","export") if self.__project_id: if isinstance(params, dict): @@ -369,8 +363,7 @@ def request_export( else: params = f"{params}&project_id={self.__project_id}" - if "headers" not in kwargs: - kwargs["headers"] = {} + kwargs["headers"] = kwargs.get("headers",{}) kwargs["headers"]["Accept"] = "application/json" @@ -380,9 +373,7 @@ def request_export( if method == "POST": kwargs["headers"]["Content-Type"] = "application/json" - kwargs["headers"][ - "Authorization" - ] = self.auth_header + kwargs["headers"]["Authorization"] = self.auth_header with metrics.http_request_timer(endpoint) as timer: response = self.perform_request( method=method, url=url, params=params, json=json, stream=True, **kwargs From de0239c35f31682738039a9c316fe7264a2b349f Mon Sep 17 00:00:00 2001 From: VishalP <20889199+Vi6hal@users.noreply.github.com> Date: Wed, 5 Jul 2023 00:04:40 +0530 Subject: [PATCH 20/22] fixed pylint --- README.md | 1 + tap_mixpanel/client.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1d10b8c..2f623d5 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,7 @@ More details may be found in the [Mixpanel API Authentication](https://developer 3. Create your tap's `config.json` file. The tap config file for this tap should include these entries: - `start_date` - the default value to use if no bookmark exists for an endpoint (rfc3339 date string) - `user_agent` (string, optional): Process and email for API logging purposes. Example: `tap-mixpanel ` + - `auth_type` (`saa` or `api_secret`): Used to toggle between [service account authentication](https://developer.mixpanel.com/reference/service-accounts) and [api secret based authentication](https://docs.mixpanel.com/docs/tracking/how-tos/api-credentials#api-secret), it is recommended by mixpanel to use service account authentication - `api_secret` (string, `ABCdef123`): an API secret for each project in Mixpanel. This can be found in the Mixpanel Console, upper-right Settings (gear icon), Organization Settings > Projects and in the Access Keys section. For this tap, only the api_secret is needed (the api_key is legacy and the token is used only for uploading data). Each Mixpanel project has a different api_secret; therefore each Singer tap pipeline instance is for a single project. - `service_account_username` (string, `username12`): Username of the service account. 
- `service_account_secret` (string, `ABCdef123`): Secret of the service account. diff --git a/tap_mixpanel/client.py b/tap_mixpanel/client.py index 15f041b..e2e6bfd 100644 --- a/tap_mixpanel/client.py +++ b/tap_mixpanel/client.py @@ -163,7 +163,7 @@ def __enter__(self): self.auth_header = f"Basic {str(base64.urlsafe_b64encode(service_account_auth.encode('utf-8')), 'utf-8')}" else: raise ConfigurationError("Error: Missing api_secret or service account username/secret in tap config.json") - print(self.__auth_type) + self.__verified = self.check_access() return self @@ -354,7 +354,7 @@ def request_export( url = f"{url}/{path}" elif path and not url: url = f"https://{self.__api_domain}/api/2.0/{path}" - + endpoint = kwargs.pop("endpoint","export") if self.__project_id: From f2098aad8a68670dc14f2ddd69fe24f80aaef8a8 Mon Sep 17 00:00:00 2001 From: VishalP <20889199+Vi6hal@users.noreply.github.com> Date: Wed, 5 Jul 2023 00:26:55 +0530 Subject: [PATCH 21/22] fixed ut --- tests/unittests/test_service_account_authentication.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unittests/test_service_account_authentication.py b/tests/unittests/test_service_account_authentication.py index 660b99d..c75189d 100644 --- a/tests/unittests/test_service_account_authentication.py +++ b/tests/unittests/test_service_account_authentication.py @@ -27,7 +27,7 @@ def test_service_account_creds(self, mock_check_access): Args: mock_check_access: Mock the check_access method to test authentication. """ - with MixpanelClient(None, "service_account_username", "service_account_secret", "project_id","api_domain", 300) as client_: + with MixpanelClient(None, "service_account_username", "service_account_secret", "project_id","api_domain", 300, auth_type="saa") as client_: pass self.assertEqual(client_.auth_header, "Basic c2VydmljZV9hY2NvdW50X3VzZXJuYW1lOnNlcnZpY2VfYWNjb3VudF9zZWNyZXQ=") @@ -55,7 +55,7 @@ def test_check_access_403_error_for_service_account_creds(self, mock_logger, moc mock_request: Mock Session.request to explicitly raise the forbidden(403) error. 
""" with self.assertRaises(MixpanelForbiddenError): - with MixpanelClient(None, "service_account_username", "service_account_secret", "project_id","api_domain", 300) as client_: + with MixpanelClient(None, "service_account_username", "service_account_secret", "project_id","api_domain", 300, auth_type="saa") as client_: client_.check_access() mock_logger.assert_called_with('HTTP-error-code: 403, Error: User is not a member of this project: %s or this project is invalid', 'project_id') From 159e9ace70bf7d963e80b6c1128b59e47b43a4b1 Mon Sep 17 00:00:00 2001 From: Vi6hal <20889199+Vi6hal@users.noreply.github.com> Date: Wed, 5 Jul 2023 02:12:24 +0000 Subject: [PATCH 22/22] updated tests to run in ssa mode --- tests/tap_tester/test_mixpanel_all_fields_pagination.py | 5 +++++ tests/tap_tester/test_mixpanel_automatic_fields.py | 5 +++++ tests/tap_tester/test_mixpanel_bookmark.py | 5 +++++ tests/tap_tester/test_mixpanel_discovery.py | 5 +++++ tests/tap_tester/test_mixpanel_start_date.py | 6 ++++++ 5 files changed, 26 insertions(+) diff --git a/tests/tap_tester/test_mixpanel_all_fields_pagination.py b/tests/tap_tester/test_mixpanel_all_fields_pagination.py index 42587b0..229a669 100644 --- a/tests/tap_tester/test_mixpanel_all_fields_pagination.py +++ b/tests/tap_tester/test_mixpanel_all_fields_pagination.py @@ -152,3 +152,8 @@ def test_run(self): # Pagination test for EU residency server self.eu_residency = True self.pagination_test_run() + + def test_run_ssa(self): + # perform checks with service account auth + self.service_account_authentication = True + self.pagination_test_run() \ No newline at end of file diff --git a/tests/tap_tester/test_mixpanel_automatic_fields.py b/tests/tap_tester/test_mixpanel_automatic_fields.py index 72d0f2b..18922d8 100644 --- a/tests/tap_tester/test_mixpanel_automatic_fields.py +++ b/tests/tap_tester/test_mixpanel_automatic_fields.py @@ -65,3 +65,8 @@ def test_standard_auto_fields(self): """Automatic fields test for standard server""" self.eu_residency = False self.automatic_fields_test_run() + + def test_run_ssa(self): + # perform checks with service account auth + self.service_account_authentication = True + self.automatic_fields_test_run() \ No newline at end of file diff --git a/tests/tap_tester/test_mixpanel_bookmark.py b/tests/tap_tester/test_mixpanel_bookmark.py index ffec320..2274ba3 100644 --- a/tests/tap_tester/test_mixpanel_bookmark.py +++ b/tests/tap_tester/test_mixpanel_bookmark.py @@ -189,3 +189,8 @@ def test_standard_bookmarks(self): """Bookmark test for standard server.""" self.eu_residency = False self.bookmark_test_run() + + def test_run_ssa(self): + # perform checks with ssa auth + self.service_account_authentication = True + self.bookmark_test_run() \ No newline at end of file diff --git a/tests/tap_tester/test_mixpanel_discovery.py b/tests/tap_tester/test_mixpanel_discovery.py index b578dac..43c73e9 100644 --- a/tests/tap_tester/test_mixpanel_discovery.py +++ b/tests/tap_tester/test_mixpanel_discovery.py @@ -188,3 +188,8 @@ def test_eu_discovery(self): """Discovery test for EU residency server.""" self.eu_residency = True self.discovery_test_run() + + def test_run_ssa(self): + # perform checks with ssa auth + self.service_account_authentication = True + self.discovery_test_run() \ No newline at end of file diff --git a/tests/tap_tester/test_mixpanel_start_date.py b/tests/tap_tester/test_mixpanel_start_date.py index ea819ce..e12ccd7 100644 --- a/tests/tap_tester/test_mixpanel_start_date.py +++ b/tests/tap_tester/test_mixpanel_start_date.py @@ 
-174,3 +174,9 @@ def test_run(self): # Start date test for standard server self.eu_residency = False self.start_date_test_run() + + def test_run_ssa(self): + # perform checks with ssa auth + self.service_account_authentication = True + self.start_date_test_run() +
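
The patches above leave the client with two authentication modes selected by the new `auth_type` config key: the legacy `api_secret` mode (the fallback whenever the key is missing or unrecognised) and service-account mode (`saa`). Below is a minimal sketch of how the `MixpanelClient` constructor is exercised in each mode, based only on the signature and `__enter__` logic shown in this series; the credential values, project id, and timeout are placeholders, not real settings:

    # Illustrative placeholders only -- none of these credential values are real.
    from tap_mixpanel.client import MixpanelClient

    REQUEST_TIMEOUT = 300

    # Legacy mode: auth_type defaults to "api_secret" when the config value
    # is missing or not one of ("saa", "api_secret").
    secret_client = MixpanelClient(
        "ABCdef123",        # api_secret
        None,               # service_account_username (unused in this mode)
        None,               # service_account_secret (unused in this mode)
        None,               # project_id
        "mixpanel.com",     # api_domain ("eu.mixpanel.com" for EU residency)
        REQUEST_TIMEOUT,
        auth_type="api_secret",
    )

    # Service-account mode ("saa"), which the README change notes Mixpanel
    # recommends; project_id is passed so requests are scoped to the project.
    saa_client = MixpanelClient(
        None,
        "service_account_username",
        "service_account_secret",
        "project_id",
        "mixpanel.com",
        REQUEST_TIMEOUT,
        auth_type="saa",
    )

    # Entering the context manager builds the Basic auth header for the chosen
    # mode and verifies it via check_access(); missing credentials raise the
    # new ConfigurationError. Commented out here to avoid a live API call.
    # with saa_client as client_:
    #     pass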