diff --git a/.gitignore b/.gitignore
index 7146e46..54792a9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,10 @@
-# ignore the data directories
-data/*
-compressed/*
+# ignore the data directory
+data*
+compressed*
 
-# macOS junk files
+# (macOS) junk files
 .DS_Store
 __pycache__
+
+# pipenv
+Pipfile*
diff --git a/fetch_candlesticks.py b/fetch_candlesticks.py
new file mode 100755
index 0000000..f4601ce
--- /dev/null
+++ b/fetch_candlesticks.py
@@ -0,0 +1,245 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+"""
+Download historical candlestick data for all trading pairs on Binance.com.
+All trading pair data is checked for integrity, sorted and saved as a Parquet
+file.
+"""
+
+__author__ = "GOSUTO.AI"
+__version__ = "2.0.0"
+
+import os
+import random
+import time
+from datetime import datetime
+
+import pandas as pd
+import requests
+from progressbar import ProgressBar
+
+import preprocessing as pp
+
+
+# check whether script being run in PyCharm environment
+IN_PYCHARM = "PYCHARM_HOSTED" in os.environ
+
+BATCH_SIZE = 1000  # number of candles to fetch per API request
+SHAVE_OFF_TODAY = True  # shave off candles after last midnight to equalize end-time for all trading pairs
+SKIP_DELISTED = False  # if True, only pairs whose status is "TRADING" are updated
+DATA_PATH = "data"  # directory the per-pair parquet files are written to
+
+API_BASE = "https://api.binance.com/api/v3/"  # endpoint names are appended to this
+
+LABELS = [  # column names given to the rows returned by the klines endpoint
+    "open_time",
+    "open",
+    "high",
+    "low",
+    "close",
+    "volume",
+    "close_time",
+    "quote_asset_volume",
+    "number_of_trades",
+    "taker_buy_base_asset_volume",
+    "taker_buy_quote_asset_volume",
+    "ignore",
+]
+
+
+def get_batch(symbol, interval="1m", start_time=0, limit=1000):
+    """
+    Fetch one batch of candlesticks with a GET request. Return a dataframe
+    with LABELS as columns on HTTP 200, else an empty dataframe. Connection
+    errors and timeouts are retried indefinitely after a 5 minute cool-down.
+    """
+    params = {
+        "symbol": symbol,
+        "interval": interval,
+        "startTime": start_time,
+        "limit": limit,
+    }
+
+    # loop instead of recursing: a long outage must not exhaust the call stack
+    while True:
+        try:
+            response = requests.get(f"{API_BASE}klines", params, timeout=30)
+            break
+        except requests.exceptions.ConnectionError:
+            print("Connection error, Cooling down for 5 mins...")
+        except requests.exceptions.Timeout:
+            print("Timeout, Cooling down for 5 min...")
+        time.sleep(5 * 60)
+
+    if response.status_code == 200:
+        return pd.DataFrame(response.json(), columns=LABELS)
+    print(f"Got erroneous response back: {response}")
+    return pd.DataFrame([])
+
+
+def gather_new_candles(base, quote, last_timestamp, interval="1m", n=0, n_count=0):
+    """
+    Fetch all candles newer than `last_timestamp` (ms since epoch, 0 = start
+    from the beginning) batch by batch and return them as a list of
+    dataframes. `n` and `n_count` are only used to label the log lines.
+    """
+    previous_timestamp = None
+    batches = [pd.DataFrame([], columns=LABELS)]  # first entry is an empty LABELS frame
+
+    first_read = True
+    start_datetime = None
+    bar = None
+    if last_timestamp == 0:
+        print(
+            f"{datetime.now()} {n:04d}/{n_count} Starting from the beginning (no last known timestamp)"
+        )
+    else:
+        print(
+            f"{datetime.now()} {n:04d}/{n_count} Starting from last known timestamp ({datetime.fromtimestamp(last_timestamp / 1000)})"
+        )
+    while previous_timestamp != last_timestamp:
+        previous_timestamp = last_timestamp
+
+        new_batch = get_batch(
+            symbol=base + quote,
+            interval=interval,
+            start_time=last_timestamp + 1,  # resume 1 ms after the newest known candle
+            limit=BATCH_SIZE,
+        )
+        # requesting candles from the future returns empty
+        # also stop in case response code was not 200
+        if new_batch.empty:
+            break
+
+        last_timestamp = new_batch["open_time"].max()
+        # sometimes no new trades took place yet on date.today();
+        # in this case the batch contains no new data
+        if previous_timestamp == last_timestamp:
+            break
+
+        batches.append(new_batch)
+
+        # set up progress reporting once, based on the first batch received
+        if first_read:
+            start_datetime = datetime.fromtimestamp(new_batch["open_time"][0] / 1000)
+            missing_data_timedelta = datetime.now() - start_datetime
+            total_minutes_of_data = int(missing_data_timedelta.total_seconds() / 60) + 1
+            if total_minutes_of_data > 1440:  # more than 24 * 60 minutes, i.e. over a day
+                missing_data_timedelta = str(missing_data_timedelta).split(",")[0]
+            else:
+                missing_data_timedelta = "24 hours"
+            print(
+                f"{datetime.now()} {n:04d}/{n_count} Fetching all available data from last {missing_data_timedelta} (max {total_minutes_of_data} candles)"
+            )
+            if IN_PYCHARM:
+                time.sleep(0.2)  # NOTE(review): presumably lets the console catch up — confirm
+            first_read = False
+            if total_minutes_of_data >= BATCH_SIZE * 2:  # bar only for two or more batches' worth
+                bar = ProgressBar(max_value=total_minutes_of_data).start()
+
+        if bar is not None:
+            time_covered = (
+                datetime.fromtimestamp(last_timestamp / 1000) - start_datetime
+            )
+            minutes_covered = int(time_covered.total_seconds() / 60) + 1
+            bar.max_value = max(bar.max_value, minutes_covered)  # never let progress exceed the maximum
+            bar.update(minutes_covered)
+    if bar is not None:
+        bar.finish(dirty=True)
+    if IN_PYCHARM:
+        time.sleep(0.2)
+    return batches
+
+
+def all_candles_to_parquet(base, quote, interval="1m", n=0, n_count=0):
+    """
+    Bring the parquet file of one trading pair up to date: look up the last
+    candle on disk, fetch everything newer and append it to the file.
+    Returns whatever `write_to_parquet` reports for the fetched batches.
+    """
+    target = f"{DATA_PATH}/{base}-{quote}.parquet"
+
+    # resume right after the newest candle that is already on disk
+    resume_from = get_parquet_info(target)[0]
+    fetched = gather_new_candles(base, quote, resume_from, interval, n, n_count)
+    return write_to_parquet(target, fetched, append=True)
+
+
+def get_parquet_info(filepath):
+    """
+    Return `(last_timestamp, old_lines)` for a parquet file: the newest index
+    entry in milliseconds since epoch and the number of rows. Both are 0 if
+    the file holds no data or reading it raises an OSError.
+    """
+    try:
+        stored = pd.read_parquet(filepath)
+        if stored.empty:
+            return 0, 0
+        newest = stored.index.max()
+        return int(newest.timestamp() * 1000), len(stored.index)
+    except OSError:
+        # an unreadable (e.g. not yet existing) file counts as "no data yet"
+        return 0, 0
+
+
+def write_to_parquet(file, batches, append=False):
+    """
+    Concatenate candle batches, write them via `pp` and return the raw count.
+    """
+    merged = pd.concat(batches, ignore_index=True)
+    pp.write_raw_to_parquet(merged, file, SHAVE_OFF_TODAY, append=append)
+    return merged.shape[0]
+
+
+def main():
+    """
+    Main loop; update the parquet file of every currency pair that exists on
+    the exchange (only the actively trading ones if SKIP_DELISTED is set).
+    """
+    # get all pairs currently available; same timeout as in get_batch so that
+    # a stalled connection cannot hang the script forever
+    all_symbols = pd.DataFrame(
+        requests.get(f"{API_BASE}exchangeInfo", timeout=30).json()["symbols"]
+    )
+    active_symbols = all_symbols.loc[all_symbols["status"] == "TRADING"]
+    # pairs that are not trading are kept unless SKIP_DELISTED is set
+    selected = active_symbols if SKIP_DELISTED else all_symbols
+    all_pairs = [
+        tuple(x)
+        for x in selected[["baseAsset", "quoteAsset"]].to_records(index=False)
+    ]
+    if SKIP_DELISTED:
+        n_inactive = len(all_symbols) - len(active_symbols)
+        print(
+            f"{datetime.now()} Got {len(all_pairs)} active pairs from Binance. Dropped {n_inactive} inactive pairs."
+        )
+    else:
+        print(
+            f"{datetime.now()} Got {len(all_pairs)} pairs from Binance, of which {len(active_symbols)} are active."
+        )
+
+    # randomising order helps during testing and doesn't make any difference in production
+    random.shuffle(all_pairs)
+
+    # make sure data folders exist
+    os.makedirs(DATA_PATH, exist_ok=True)
+
+    # do a full update on all pairs
+    n_count = len(all_pairs)
+    for n, pair in enumerate(all_pairs, 1):
+        base, quote = pair
+        print(f"{datetime.now()} {n:04d}/{n_count} Updating {base}-{quote}")
+        new_lines = all_candles_to_parquet(base=base, quote=quote, n=n, n_count=n_count)
+        if new_lines > 0:
+            print(
+                f"{datetime.now()} {n:04d}/{n_count} Wrote {new_lines} new candles to file for {base}-{quote}"
+            )
+        else:
+            print(
+                f"{datetime.now()} {n:04d}/{n_count} Already up to date with {base}-{quote}"
+            )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/main.py b/main.py
deleted file mode 100755
index f7bbfef..0000000
--- a/main.py
+++ /dev/null
@@ -1,209 +0,0 @@
-#!/usr/bin/env python
-# coding: utf-8
-
-"""Download historical candlestick data for all trading pairs on Binance.com.
-All trading pair data is checked for integrity, sorted and saved as both a CSV
-and a Parquet file. The CSV files act as a raw buffer on every update round.
-The Parquet files are much more space efficient (~50GB vs ~10GB) and are
-therefore the files used to upload to Kaggle after each run.
-"""
-
-__author__ = 'GOSUTO.AI'
-
-import json
-import os
-import random
-import subprocess
-import time
-from datetime import date, datetime, timedelta
-
-import requests
-import pandas as pd
-
-import preprocessing as pp
-
-API_BASE = 'https://api.binance.com/api/v3/'
-
-LABELS = [
-    'open_time',
-    'open',
-    'high',
-    'low',
-    'close',
-    'volume',
-    'close_time',
-    'quote_asset_volume',
-    'number_of_trades',
-    'taker_buy_base_asset_volume',
-    'taker_buy_quote_asset_volume',
-    'ignore'
-]
-
-METADATA = {
-    'id': 'jorijnsmit/binance-full-history',
-    'title': 'Binance Full History',
-    'isPrivate': False,
-    'licenses': [{'name': 'other'}],
-    'keywords': [
-        'business',
-        'finance',
-        'investing',
-        'currencies and foreign exchange'
-    ],
-    'collaborators': [],
-    'data': []
-}
-
-def write_metadata(n_count):
-    """Write the metadata file dynamically so we can include a pair count."""
-
-    METADATA['subtitle'] = f'1 minute candlesticks for all {n_count} cryptocurrency pairs'
-    METADATA['description'] = f"""### Introduction\n\nThis is a collection of all 1 minute candlesticks of all cryptocurrency pairs on [Binance.com](https://binance.com). All {n_count} of them are included. Both retrieval and uploading the data is fully automated—see [this GitHub repo](https://github.com/gosuto-ai/candlestick_retriever).\n\n### Content\n\nFor every trading pair, the following fields from [Binance's official API endpoint for historical candlestick data](https://github.com/binance-exchange/binance-official-api-docs/blob/master/rest-api.md#klinecandlestick-data) are saved into a Parquet file:\n\n```\n #   Column                        Dtype         \n---  ------                        -----         \n 0   open_time                     datetime64[ns]\n 1   open                          float32       \n 2   high                          float32       \n 3   low                           float32       \n 4   close                         float32       \n 5   volume                        float32       \n 6   quote_asset_volume            float32       \n 7   number_of_trades              uint16        \n 8   taker_buy_base_asset_volume   float32       \n 9   taker_buy_quote_asset_volume  float32       \ndtypes: datetime64[ns](1), float32(8), uint16(1)\n```\n\nThe dataframe is indexed by `open_time` and sorted from oldest to newest. 
The first row starts at the first timestamp available on the exchange, which is July 2017 for the longest running pairs.\n\nHere are two simple plots based on a single file; one of the opening price with an added indicator (MA50) and one of the volume and number of trades:\n\n![](https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F2234678%2Fb8664e6f26dc84e9a40d5a3d915c9640%2Fdownload.png?generation=1582053879538546&alt=media)\n![](https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F2234678%2Fcd04ed586b08c1576a7b67d163ad9889%2Fdownload-1.png?generation=1582053899082078&alt=media)\n\n### Inspiration\n\nOne obvious use-case for this data could be technical analysis by adding indicators such as moving averages, MACD, RSI, etc. Other approaches could include backtesting trading algorithms or computing arbitrage potential with other exchanges.\n\n### License\n\nThis data is being collected automatically from crypto exchange Binance."""
-
-    with open('compressed/dataset-metadata.json', 'w') as file:
-        json.dump(METADATA, file, indent=4)
-
-
-def get_batch(symbol, interval='1m', start_time=0, limit=1000):
-    """Use a GET request to retrieve a batch of candlesticks. Process the JSON into a pandas
-    dataframe and return it. If not successful, return an empty dataframe.
-    """
-
-    params = {
-        'symbol': symbol,
-        'interval': interval,
-        'startTime': start_time,
-        'limit': limit
-    }
-    try:
-        # timeout should also be given as a parameter to the function
-        response = requests.get(f'{API_BASE}klines', params, timeout=30)
-    except requests.exceptions.ConnectionError:
-        print('Connection error, Cooling down for 5 mins...')
-        time.sleep(5 * 60)
-        return get_batch(symbol, interval, start_time, limit)
-    
-    except requests.exceptions.Timeout:
-        print('Timeout, Cooling down for 5 min...')
-        time.sleep(5 * 60)
-        return get_batch(symbol, interval, start_time, limit)
-    
-    except requests.exceptions.ConnectionResetError:
-        print('Connection reset by peer, Cooling down for 5 min...')
-        time.sleep(5 * 60)
-        return get_batch(symbol, interval, start_time, limit)
-
-    if response.status_code == 200:
-        return pd.DataFrame(response.json(), columns=LABELS)
-    print(f'Got erroneous response back: {response}')
-    return pd.DataFrame([])
-
-
-def all_candles_to_csv(base, quote, interval='1m'):
-    """Collect a list of candlestick batches with all candlesticks of a trading pair,
-    concat into a dataframe and write it to CSV.
-    """
-
-    # see if there is any data saved on disk already
-    try:
-        batches = [pd.read_csv(f'data/{base}-{quote}.csv')]
-        last_timestamp = batches[-1]['open_time'].max()
-    except FileNotFoundError:
-        batches = [pd.DataFrame([], columns=LABELS)]
-        last_timestamp = 0
-    old_lines = len(batches[-1].index)
-
-    # gather all candlesticks available, starting from the last timestamp loaded from disk or 0
-    # stop if the timestamp that comes back from the api is the same as the last one
-    previous_timestamp = None
-
-    while previous_timestamp != last_timestamp:
-        # stop if we reached data from today
-        if date.fromtimestamp(last_timestamp / 1000) >= date.today():
-            break
-
-        previous_timestamp = last_timestamp
-
-        new_batch = get_batch(
-            symbol=base+quote,
-            interval=interval,
-            start_time=last_timestamp+1
-        )
-
-        # requesting candles from the future returns empty
-        # also stop in case response code was not 200
-        if new_batch.empty:
-            break
-
-        last_timestamp = new_batch['open_time'].max()
-
-        # sometimes no new trades took place yet on date.today();
-        # in this case the batch is nothing new
-        if previous_timestamp == last_timestamp:
-            break
-
-        batches.append(new_batch)
-        last_datetime = datetime.fromtimestamp(last_timestamp / 1000)
-
-        covering_spaces = 20 * ' '
-        print(datetime.now(), base, quote, interval, str(last_datetime)+covering_spaces, end='\r', flush=True)
-
-    # write clean version of csv to parquet
-    parquet_name = f'{base}-{quote}.parquet'
-    full_path = f'compressed/{parquet_name}'
-    df = pd.concat(batches, ignore_index=True)
-    df = pp.quick_clean(df)
-    pp.write_raw_to_parquet(df, full_path)
-    METADATA['data'].append({
-        'description': f'All trade history for the pair {base} and {quote} at 1 minute intervals. Counts {df.index.size} records.',
-        'name': parquet_name,
-        'totalBytes': os.stat(full_path).st_size,
-        'columns': []
-    })
-
-    # in the case that new data was gathered write it to disk
-    if len(batches) > 1:
-        df.to_csv(f'data/{base}-{quote}.csv', index=False)
-        return len(df.index) - old_lines
-    return 0
-
-
-def main():
-    """Main loop; loop over all currency pairs that exist on the exchange. Once done upload the
-    compressed (Parquet) dataset to Kaggle.
-    """
-
-    # get all pairs currently available
-    all_symbols = pd.DataFrame(requests.get(f'{API_BASE}exchangeInfo').json()['symbols'])
-    all_pairs = [tuple(x) for x in all_symbols[['baseAsset', 'quoteAsset']].to_records(index=False)]
-
-    # randomising order helps during testing and doesn't make any difference in production
-    random.shuffle(all_pairs)
-
-    # make sure data folders exist
-    os.makedirs('data', exist_ok=True)
-    os.makedirs('compressed', exist_ok=True)
-
-    # do a full update on all pairs
-    n_count = len(all_pairs)
-    for n, pair in enumerate(all_pairs, 1):
-        base, quote = pair
-        new_lines = all_candles_to_csv(base=base, quote=quote)
-        if new_lines > 0:
-            print(f'{datetime.now()} {n}/{n_count} Wrote {new_lines} new lines to file for {base}-{quote}')
-        else:
-            print(f'{datetime.now()} {n}/{n_count} Already up to date with {base}-{quote}')
-
-    # clean the data folder and upload a new version of the dataset to kaggle
-    try:
-        os.remove('compressed/.DS_Store')
-    except FileNotFoundError:
-        pass
-    write_metadata(n_count)
-    yesterday = date.today() - timedelta(days=1)
-    subprocess.run(['kaggle', 'datasets', 'version', '-p', 'compressed/', '-m', f'full update of all {n_count} pairs up to {str(yesterday)}'])
-    os.remove('compressed/dataset-metadata.json')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/preprocessing.py b/preprocessing.py
deleted file mode 100644
index 7ee3660..0000000
--- a/preprocessing.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import os
-from datetime import date
-
-import pandas as pd
-
-def set_dtypes(df):
-    """
-    set datetimeindex and convert all columns in pd.df to their proper dtype
-    assumes csv is read raw without modifications; pd.read_csv(csv_filename)"""
-
-    df['open_time'] = pd.to_datetime(df['open_time'], unit='ms')
-    df = df.set_index('open_time', drop=True)
-
-    df = df.astype(dtype={
-        'open': 'float64',
-        'high': 'float64',
-        'low': 'float64',
-        'close': 'float64',
-        'volume': 'float64',
-        'close_time': 'datetime64[ms]',
-        'quote_asset_volume': 'float64',
-        'number_of_trades': 'int64',
-        'taker_buy_base_asset_volume': 'float64',
-        'taker_buy_quote_asset_volume': 'float64',
-        'ignore': 'float64'
-    })
-
-    return df
-
-
-def set_dtypes_compressed(df):
-    """Create a `DatetimeIndex` and convert all critical columns in pd.df to a dtype with low
-    memory profile. Assumes csv is read raw without modifications; `pd.read_csv(csv_filename)`."""
-
-    df['open_time'] = pd.to_datetime(df['open_time'], unit='ms')
-    df = df.set_index('open_time', drop=True)
-
-    df = df.astype(dtype={
-        'open': 'float32',
-        'high': 'float32',
-        'low': 'float32',
-        'close': 'float32',
-        'volume': 'float32',
-        'number_of_trades': 'uint16',
-        'quote_asset_volume': 'float32',
-        'taker_buy_base_asset_volume': 'float32',
-        'taker_buy_quote_asset_volume': 'float32'
-    })
-
-    return df
-
-
-def assert_integrity(df):
-    """make sure no rows have empty cells or duplicate timestamps exist"""
-
-    assert df.isna().all(axis=1).any() == False
-    assert df['open_time'].duplicated().any() == False
-
-
-def quick_clean(df):
-    """clean a raw dataframe"""
-
-    # drop dupes
-    dupes = df['open_time'].duplicated().sum()
-    if dupes > 0:
-        df = df[df['open_time'].duplicated() == False]
-
-    # sort by timestamp, oldest first
-    df.sort_values(by=['open_time'], ascending=False)
-
-    # just a doublcheck
-    assert_integrity(df)
-
-    return df
-
-
-def write_raw_to_parquet(df, full_path):
-    """takes raw df and writes a parquet to disk"""
-
-    # some candlesticks do not span a full minute
-    # these points are not reliable and thus filtered
-    df = df[~(df['open_time'] - df['close_time'] != -59999)]
-
-    # `close_time` column has become redundant now, as is the column `ignore`
-    df = df.drop(['close_time', 'ignore'], axis=1)
-
-    df = set_dtypes_compressed(df)
-
-    # give all pairs the same nice cut-off
-    df = df[df.index < str(date.today())]
-
-    df.to_parquet(full_path)
-
-
-def groom_data(dirname='data'):
-    """go through data folder and perform a quick clean on all csv files"""
-
-    for filename in os.listdir(dirname):
-        if filename.endswith('.csv'):
-            full_path = f'{dirname}/{filename}'
-            quick_clean(pd.read_csv(full_path)).to_csv(full_path)
-
-
-def compress_data(dirname='data'):
-    """go through data folder and rewrite csv files to parquets"""
-
-    os.makedirs('compressed', exist_ok=True)
-    for filename in os.listdir(dirname):
-        if filename.endswith('.csv'):
-            full_path = f'{dirname}/{filename}'
-
-            df = pd.read_csv(full_path)
-
-            new_filename = filename.replace('.csv', '.parquet')
-            new_full_path = f'compressed/{new_filename}'
-            write_raw_to_parquet(df, new_full_path)
diff --git a/preprocessing/__init__.py b/preprocessing/__init__.py
new file mode 100644
index 0000000..fa32fe2
--- /dev/null
+++ b/preprocessing/__init__.py
@@ -0,0 +1,83 @@
+from datetime import date
+
+import pandas as pd
+
+
+def set_dtypes_compressed(df):
+    """
+    Create a `DatetimeIndex` from the millisecond `open_time` column of a raw
+    pd.df and cast the critical columns to dtypes with a low memory profile.
+    The dataframe passed in is left untouched; a converted copy is returned.
+    """
+    # `assign` builds a new frame instead of overwriting the caller's column
+    df = df.assign(open_time=pd.to_datetime(df["open_time"], unit="ms"))
+    df = df.set_index("open_time", drop=True)
+
+    return df.astype(
+        dtype={
+            "open": "float32",
+            "high": "float32",
+            "low": "float32",
+            "close": "float32",
+            "volume": "float32",
+            "number_of_trades": "uint16",  # NOTE(review): max 65535 per candle — confirm that suffices
+            "quote_asset_volume": "float32",
+            "taker_buy_base_asset_volume": "float32",
+            "taker_buy_quote_asset_volume": "float32",
+        }
+    )
+
+
+def assert_integrity(df):
+    """
+    Raise AssertionError on fully empty rows or duplicate `open_time` values.
+    """
+    if df.isna().all(axis=1).any() or df["open_time"].duplicated().any():
+        raise AssertionError("raw candles contain empty rows or duplicate timestamps")
+
+
+def clean_raw(df, limit_to_today=True):
+    """
+    Clean a raw dataframe from duplicates, sort by timestamp, filter
+    incomplete candles, drop redundant columns, set the correct dtype and
+    (optionally) cut off the data for the current day.
+    Returns the cleaned dataframe, indexed by `open_time`.
+    """
+    # drop dupes, keeping the first occurrence of every timestamp
+    df = df.drop_duplicates(subset="open_time")
+
+    # sort by timestamp, oldest first; `sort_values` returns a new frame, so
+    # the result has to be assigned back for the sort to take effect
+    df = df.sort_values(by=["open_time"], ascending=True)
+
+    # some candlesticks do not span a full minute
+    # these points are not reliable and thus filtered
+    df = df[df["close_time"] - df["open_time"] == 59999]
+
+    # `close_time` column has become redundant now, as is the column `ignore`
+    df = df.drop(["close_time", "ignore"], axis=1)
+
+    # just a doublecheck on nans and duplicate timestamps
+    assert_integrity(df)
+
+    df = set_dtypes_compressed(df)
+
+    # give all pairs the same nice cut-off
+    if limit_to_today:
+        df = df[df.index < str(date.today())]
+
+    return df
+
+
+def write_raw_to_parquet(df, full_path, limit_to_today=True, append=False):
+    """
+    Clean a raw df and write it to parquet at `full_path`. With `append`, rows
+    already in the file are kept in front; on OSError only new rows are written.
+    """
+    cleaned = clean_raw(df, limit_to_today)
+    try:
+        stored = pd.read_parquet(full_path) if append else None
+    except OSError:
+        stored = None
+    result = cleaned if stored is None else pd.concat([stored, cleaned])
+    result.to_parquet(full_path)
diff --git a/requirements.txt b/requirements.txt
index 04dd54d..786a64f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,8 @@
-requests
+kaggle
 pandas
+progressbar2
 pyarrow
-kaggle
+requests
+
+# dev: linting
+black
diff --git a/upload_to_kaggle.py b/upload_to_kaggle.py
new file mode 100644
index 0000000..67b4554
--- /dev/null
+++ b/upload_to_kaggle.py
@@ -0,0 +1,52 @@
+import json
+import os
+import subprocess
+from datetime import date, timedelta
+
+
+COMPRESSED_PATH = "compressed"
+METADATA = {
+    "id": "jorijnsmit/binance-full-history",
+    "title": "Binance Full History",
+    "isPrivate": False,
+    "licenses": [{"name": "other"}],
+    "keywords": ["business", "finance", "investing", "currencies and foreign exchange"],
+    "collaborators": [],
+    "data": [],
+}
+
+
+def write_metadata(n_count):
+    """
+    Write the metadata file dynamically so we can include a pair count.
+    """
+    METADATA[
+        "subtitle"
+    ] = f"1 minute candlesticks for all {n_count} cryptocurrency pairs"
+    METADATA[
+        "description"
+    ] = f"""### Introduction\n\nThis is a collection of all 1 minute candlesticks of all cryptocurrency pairs on [Binance.com](https://binance.com). All {n_count} of them are included. Both retrieval and uploading the data is fully automated—see [this GitHub repo](https://github.com/gosuto-ai/candlestick_retriever).\n\n### Content\n\nFor every trading pair, the following fields from [Binance's official API endpoint for historical candlestick data](https://github.com/binance-exchange/binance-official-api-docs/blob/master/rest-api.md#klinecandlestick-data) are saved into a Parquet file:\n\n```\n #   Column                        Dtype         \n---  ------                        -----         \n 0   open_time                     datetime64[ns]\n 1   open                          float32       \n 2   high                          float32       \n 3   low                           float32       \n 4   close                         float32       \n 5   volume                        float32       \n 6   quote_asset_volume            float32       \n 7   number_of_trades              uint16        \n 8   taker_buy_base_asset_volume   float32       \n 9   taker_buy_quote_asset_volume  float32       \ndtypes: datetime64[ns](1), float32(8), uint16(1)\n```\n\nThe dataframe is indexed by `open_time` and sorted from oldest to newest. 
The first row starts at the first timestamp available on the exchange, which is July 2017 for the longest running pairs.\n\nHere are two simple plots based on a single file; one of the opening price with an added indicator (MA50) and one of the volume and number of trades:\n\n![](https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F2234678%2Fb8664e6f26dc84e9a40d5a3d915c9640%2Fdownload.png?generation=1582053879538546&alt=media)\n![](https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F2234678%2Fcd04ed586b08c1576a7b67d163ad9889%2Fdownload-1.png?generation=1582053899082078&alt=media)\n\n### Inspiration\n\nOne obvious use-case for this data could be technical analysis by adding indicators such as moving averages, MACD, RSI, etc. Other approaches could include backtesting trading algorithms or computing arbitrage potential with other exchanges.\n\n### License\n\nThis data is being collected automatically from crypto exchange Binance."""
+
+    with open(f"{COMPRESSED_PATH}/dataset-metadata.json", "w") as file:
+        json.dump(METADATA, file, indent=4)
+
+
+# n_count was never defined here; derive it from the parquet files (assumed one per pair)
+n_count = sum(name.endswith(".parquet") for name in os.listdir(COMPRESSED_PATH))
+try:
+    os.remove(f"{COMPRESSED_PATH}/.DS_Store")
+except FileNotFoundError:
+    pass
+write_metadata(n_count)
+subprocess.run(
+    [
+        "kaggle",
+        "datasets",
+        "version",
+        "-p",
+        f"{COMPRESSED_PATH}/",
+        "-m",
+        f"full update of all {n_count} pairs up to {date.today() - timedelta(days=1)}",
+    ]
+)
+os.remove(f"{COMPRESSED_PATH}/dataset-metadata.json")