NextSecurity
diff --git a/‎.gitignore
+1 b/‎.gitignore
+1
diff --git a/‎.travis.yml
+7 b/‎.travis.yml
+7
diff --git a/‎README.md
+91 b/‎README.md
+91
diff --git a/‎continious-integration.sh
+15 b/‎continious-integration.sh
+15
diff --git a/‎guesslangtools/__init__.py
+1 b/‎guesslangtools/__init__.py
+1
diff --git a/‎guesslangtools/__main__.py
+105 b/‎guesslangtools/__main__.py
+105
diff --git a/‎guesslangtools/app.py
+18 b/‎guesslangtools/app.py
+18
@@ -127,3 +127,4 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+.vscode/
@@ -0,0 +1,7 @@
+# Travis CI configuration: install the package with pip,
+# then run the continuous integration script.
+dist: bionic
+language: python
+python:
+  - "3.6"
+install:
+  - pip install .
+script: ./continious-integration.sh
@@ -0,0 +1,91 @@
+# GuesslangTools [![Build Status](https://travis-ci.org/yoeo/guesslangtools.svg?branch=master)](https://travis-ci.org/yoeo/guesslangtools)
+
+
+![Guesslangtools](guesslangtools/data/guesslangtools.png)
+
+A training dataset generator for Guesslang's deep learning model.
+
+## Description
+
+The purpose of GuesslangTools is to find and download **a million** source code files.
+These files are used to train, evaluate and test
+[Guesslang](https://github.com/yoeo/guesslang),
+a deep learning programming language detection tool.
+
+The files are retrieved from more than **100k** public open source
+GitHub repositories.
+
+### Workflow
+
+The million source code files used to feed Guesslang are generated as follows:
+
+1. Download GitHub open source repository information from the
+[Libraries.io Open Source Repository and Dependency Metadata](https://zenodo.org/record/1196312/files/Libraries.io-open-data-1.2.0.tar.gz).
+2. Randomly select the repositories that will be used to create
+  Guesslang's training, validation and test datasets.
+3. Download each selected repository.
+4. Extract some source code files from the downloaded repositories.
+
+This workflow is fully automated but takes several hours to complete,
+especially the download part.
+Fortunately, it can be stopped and resumed at any moment.
+
+### Constraints
+
+GuesslangTools ensures that:
+
+* Each source code file in the datasets is unique.
+* There are no empty files.
+* Only text files are retrieved, binary files are skipped.
+* All the files are converted to UTF-8 encoding.
+* Each selected repository is associated with only one dataset
+  (training, validation or test),
+  therefore files from a training repository can only be in
+  the training dataset. Same for the validation and test datasets.
+
+## Usage
+
+### Prerequisite
+
+* GuesslangTools requires Python 3.6 or later.
+* At least 16GB of total system memory is recommended.
+* At least 150GB of free storage space is recommended.
+
+### Installation
+
+You can install GuesslangTools from the source code by running:
+
+```bash
+pip install .
+```
+
+### Execution
+
+You can run GuesslangTools in a terminal as follows:
+
+```bash
+gltool /path/to/source_files_destination
+```
+
+Several options and hacks are available to fine tune the size and
+the diversity of the generated datasets. To list all the options, please run:
+
+```bash
+gltool --help
+```
+
+## License and credits
+
+* [Guesslang documentation](https://guesslang.readthedocs.io/en/latest/)
+
+* [Guesslang on GitHub](https://github.com/yoeo/guesslang)
+
+* Guesslang icon created with
+  [AndroidAssetStudio](https://github.com/romannurik/AndroidAssetStudio)
+
+* Repository dataset downloaded from
+  [Libraries.io Open Source Repository and Dependency Metadata](https://zenodo.org/record/1196312/files/Libraries.io-open-data-1.2.0.tar.gz)
+
+* SQL repositories dataset retrieved from [The Public Git Archive](https://github.com/src-d/datasets/tree/master/PublicGitArchive)
+
+* GuesslangTools — Copyright (c) 2020 Y. SOMDA, [MIT License](LICENSE)
@@ -0,0 +1,15 @@
+#!/bin/bash
+#
+# Continuous integration checks: install the development dependencies,
+# then run the tests, the static type checker and the linter.
+
+# -e: stop at the first failing command
+# -x: print each command before it is executed
+set -ex
+
+# install CI dependencies
+pip install -r requirements-dev.txt
+
+# run tests
+python setup.py test
+
+# check static types
+mypy --strict --ignore-missing-imports .
+
+# check code quality
+flake8 .
@@ -0,0 +1 @@
+# Version string of the guesslangtools package
+__version__ = '0.1'
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+
+from argparse import ArgumentParser, Namespace
+from contextlib import suppress
+import logging.config
+from typing import Dict, Any
+
+from guesslangtools import hacks
+from guesslangtools.common import Config
+from guesslangtools.app import run_workflow
+
+
+# Logging settings applied with ``logging.config.dictConfig`` in ``main``.
+# ``main`` overwrites the root level ('DEBUG' or 'INFO') before applying them.
+LOGGING_CONFIG: Dict[str, Any] = {
+    'version': 1,
+    'disable_existing_loggers': False,
+    'formatters': {
+        # Each message is prefixed with the time (HH:MM:SS) and the level name
+        'simple': {
+            'format': '%(asctime)s %(levelname)s: %(message)s',
+            'datefmt': '%H:%M:%S',
+        }
+    },
+    'handlers': {
+        # The handler accepts every level, filtering is done by the root logger
+        'console': {
+            'class': 'logging.StreamHandler',
+            'level': 'DEBUG',
+            'formatter': 'simple',
+        }
+    },
+    'root': {
+        'level': 'DEBUG',
+        'handlers': ['console'],
+    },
+}
+
+# Logger named after this module
+LOGGER = logging.getLogger(__name__)
+
+
+def main() -> None:
+    parser = ArgumentParser(description='Guesslang data preparation tool')
+    parser.add_argument(
+        '-d', '--debug', action='store_true',
+        help='display debug messages')
+
+    parser.add_argument(
+        'CACHE_DIR',
+        help='directory where the generated content will be stored')
+    parser.add_argument(
+        '--nb-train-files', type=int, default=27000,
+        help='number of training files per language')
+    parser.add_argument(
+        '--nb-valid-files', type=int, default=4000,
+        help='number of validation files per language')
+    parser.add_argument(
+        '--nb-test-files', type=int, default=4000,
+        help='number of testing files per language')
+    parser.add_argument(
+        '--nb-repo', type=int, default=4000,
+        help='number of repositories per language')
+
+    parser.add_argument(
+        '--hack-repo-dist', action='store_true', default=False,
+        help='show the number of selected repositories per languages')
+    parser.add_argument(
+        '--hack-add-repo', nargs='+', metavar='LANGUAGE',
+        help='select more repositories for the listed languages')
+    parser.add_argument(
+        '--hack-only-downloaded-repo', action='store_true', default=False,
+        help='only use the repositories that have already been downloaded')
+
+    args = parser.parse_args()
+    items = vars(args).items()
+    hack_args = any(val for name, val in items if name.startswith('hack_'))
+
+    log_level = 'DEBUG' if args.debug else 'INFO'
+    LOGGING_CONFIG['root']['level'] = log_level
+    logging.config.dictConfig(LOGGING_CONFIG)
+
+    Config.setup(
+        cache_dir=args.CACHE_DIR,
+        nb_repositories=args.nb_repo,
+        nb_train=args.nb_train_files,
+        nb_valid=args.nb_valid_files,
+        nb_test=args.nb_test_files,
+    )
+
+    with suppress(KeyboardInterrupt):
+        if hack_args:
+            run_hacks(args)
+        else:
+            run_workflow()
+
+
+def run_hacks(args: Namespace) -> None:
+    if args.hack_repo_dist:
+        hacks.show_repositories_distribution()
+
+    if args.hack_add_repo:
+        hacks.select_more_repositories(args.hack_add_repo)
+
+    if args.hack_only_downloaded_repo:
+        hacks.select_only_downloaded_repo()
+
+
+# Run the command line tool when this module is executed as a script
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,18 @@
+from guesslangtools.workflow import repositories_dataset
+from guesslangtools.workflow import compressed_repositories
+from guesslangtools.workflow import source_files
+
+
+def run_workflow() -> None:
+    repositories_dataset.download()
+    repositories_dataset.extract()
+    repositories_dataset.shrink()
+    repositories_dataset.alter()
+
+    compressed_repositories.select()
+    compressed_repositories.prepare()
+    compressed_repositories.download()
+
+    source_files.list_all()
+    source_files.split()
+    source_files.extract()
Original file line number	Diff line number	Diff line change
`@@ -127,3 +127,4 @@ dmypy.json`
`127`	`127`
`128`	`128`	`# Pyre type checker`
`129`	`129`	`.pyre/`
	`130`	`+.vscode/`