Skip to content

Commit 67ff6e3

Browse files
committed
Initial commit
1 parent a27adf3 commit 67ff6e3

22 files changed

+7060
-0
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,4 @@ dmypy.json
127127

128128
# Pyre type checker
129129
.pyre/
130+
.vscode/

.travis.yml

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
dist: bionic
2+
language: python
3+
python:
4+
- "3.6"
5+
install:
6+
- pip install .
7+
script: ./continious-integration.sh

README.md

+91
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# GuesslangTools [![Build Status](https://travis-ci.org/yoeo/guesslangtools.svg?branch=master)](https://travis-ci.org/yoeo/guesslangtools)
2+
3+
4+
![Guesslangtools](guesslangtools/data/guesslangtools.png)
5+
6+
A training dataset generator for Guesslang's deep learning model.
7+
8+
## Description
9+
10+
The purpose of GuesslangTools is to find and download **a million** source code files.
11+
These files are used to train, evaluate and test
12+
[Guesslang](https://github.com/yoeo/guesslang),
13+
a deep learning programming language detection tool.
14+
15+
The files are retrieved from more than **100k** public open source
16+
GitHub repositories.
17+
18+
### Workflow
19+
20+
The million source code files used to feed Guesslang are generated as follows:
21+
22+
1. Download GitHub open source repository information from the
23+
[Libraries.io Open Source Repository and Dependency Metadata](https://zenodo.org/record/1196312/files/Libraries.io-open-data-1.2.0.tar.gz).
24+
2. Randomly select the repositories that will be used to create
25+
Guesslang's training, validation and test datasets.
26+
3. Download each selected repository.
27+
4. Extract some source code files from the downloaded repositories.
28+
29+
This workflow is fully automated but takes several hours to complete,
30+
especially the download part.
31+
Fortunately, it can be stopped and resumed at any moment.
32+
33+
### Constraints
34+
35+
GuesslangTools ensures that:
36+
37+
* Each source code file in the datasets is unique.
38+
* There are no empty files.
39+
* Only text files are retrieved, binary files are skipped.
40+
* All the files are converted to UTF-8 encoding.
41+
* Each selected repository is associated with only one dataset
42+
(training, validation or test),
43+
therefore files from a training repository can only be in
44+
the training dataset. Same for the validation and test datasets.
45+
46+
## Usage
47+
48+
### Prerequisite
49+
50+
* GuesslangTools requires Python 3.6 or later.
51+
* At least 16GB of total system memory is recommended.
52+
* At least 150GB of free storage space is recommended.
53+
54+
### Installation
55+
56+
You can install GuesslangTools from the source code by running:
57+
58+
```bash
59+
pip install .
60+
```
61+
62+
### Execution
63+
64+
You can run GuesslangTools in a terminal as follows:
65+
66+
```bash
67+
gltool /path/to/source_files_destination
68+
```
69+
70+
Several options and hacks are available to fine tune the size and
71+
the diversity of the generated datasets. To list all the options, please run:
72+
73+
```bash
74+
gltool --help
75+
```
76+
77+
## License and credits
78+
79+
* [Guesslang documentation](https://guesslang.readthedocs.io/en/latest/)
80+
81+
* [Guesslang on GitHub](https://github.com/yoeo/guesslang)
82+
83+
* Guesslang icon created with
84+
[AndroidAssetStudio](https://github.com/romannurik/AndroidAssetStudio)
85+
86+
* Repository dataset downloaded from
87+
[Libraries.io Open Source Repository and Dependency Metadata](https://zenodo.org/record/1196312/files/Libraries.io-open-data-1.2.0.tar.gz)
88+
89+
* SQL repositories dataset retrieved from [The Public Git Archive](https://github.com/src-d/datasets/tree/master/PublicGitArchive)
90+
91+
* GuesslangTools — Copyright (c) 2020 Y. SOMDA, [MIT License](LICENSE)

continious-integration.sh

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
#
# Continuous integration checks: install the development requirements,
# then run the tests, the static type checker and the linter.

# -e: stop at the first failing command, -x: echo each command before
# running it.
set -ex

# install CI dependencies
pip install -r requirements-dev.txt

# run tests
python setup.py test

# check static types
mypy --strict --ignore-missing-imports .

# check code quality
flake8 .

guesslangtools/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
__version__ = '0.1'

guesslangtools/__main__.py

+105
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
#!/usr/bin/env python3
2+
3+
from argparse import ArgumentParser, Namespace
4+
from contextlib import suppress
5+
import logging.config
6+
from typing import Dict, Any
7+
8+
from guesslangtools import hacks
9+
from guesslangtools.common import Config
10+
from guesslangtools.app import run_workflow
11+
12+
13+
# Formatter producing lines such as "12:34:56 INFO: message".
_SIMPLE_FORMATTER: Dict[str, str] = {
    'format': '%(asctime)s %(levelname)s: %(message)s',
    'datefmt': '%H:%M:%S',
}

# Stream handler that accepts every record from DEBUG upward; the
# effective filtering is done by the root logger level below.
_CONSOLE_HANDLER: Dict[str, str] = {
    'class': 'logging.StreamHandler',
    'level': 'DEBUG',
    'formatter': 'simple',
}

# Dictionary-based logging configuration (schema version 1).
# The root level set here is only a default: ``main`` overwrites it
# before passing this dictionary to ``logging.config.dictConfig``.
LOGGING_CONFIG: Dict[str, Any] = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {'simple': _SIMPLE_FORMATTER},
    'handlers': {'console': _CONSOLE_HANDLER},
    'root': {
        'level': 'DEBUG',
        'handlers': ['console'],
    },
}

LOGGER = logging.getLogger(__name__)
36+
37+
38+
def _build_parser() -> ArgumentParser:
    """Create the command line parser of the tool.

    The options are registered in three groups: general options,
    dataset sizing options and ``--hack-*`` options.
    """
    parser = ArgumentParser(description='Guesslang data preparation tool')

    # General options
    parser.add_argument(
        '-d', '--debug', action='store_true',
        help='display debug messages')
    parser.add_argument(
        'CACHE_DIR',
        help='directory where the generated content will be stored')

    # Dataset sizing options, all of them are integers with a default value
    sizing_options = (
        ('--nb-train-files', 27000, 'number of training files per language'),
        ('--nb-valid-files', 4000, 'number of validation files per language'),
        ('--nb-test-files', 4000, 'number of testing files per language'),
        ('--nb-repo', 4000, 'number of repositories per language'),
    )
    for flag, default_value, description in sizing_options:
        parser.add_argument(
            flag, type=int, default=default_value, help=description)

    # Hack options
    parser.add_argument(
        '--hack-repo-dist', action='store_true', default=False,
        help='show the number of selected repositories per languages')
    parser.add_argument(
        '--hack-add-repo', nargs='+', metavar='LANGUAGE',
        help='select more repositories for the listed languages')
    parser.add_argument(
        '--hack-only-downloaded-repo', action='store_true', default=False,
        help='only use the repositories that have already been downloaded')

    return parser


def main() -> None:
    """Command line entry point.

    Parse the arguments, configure the logging and the shared ``Config``,
    then run either the requested hacks or the regular workflow.
    A keyboard interruption stops the run without raising.
    """
    args = _build_parser().parse_args()

    # A single truthy ``hack_*`` option is enough to switch from the
    # regular workflow to the hacks.
    hack_requested = any(
        value
        for option, value in vars(args).items()
        if option.startswith('hack_')
    )

    LOGGING_CONFIG['root']['level'] = 'DEBUG' if args.debug else 'INFO'
    logging.config.dictConfig(LOGGING_CONFIG)

    Config.setup(
        cache_dir=args.CACHE_DIR,
        nb_repositories=args.nb_repo,
        nb_train=args.nb_train_files,
        nb_valid=args.nb_valid_files,
        nb_test=args.nb_test_files,
    )

    with suppress(KeyboardInterrupt):
        if not hack_requested:
            run_workflow()
        else:
            run_hacks(args)
91+
92+
93+
def run_hacks(args: Namespace) -> None:
    """Run each hack enabled in the parsed command line arguments.

    The three hacks are checked independently, in a fixed order, so
    several of them can run during the same call.

    :param args: parsed arguments holding the ``hack_*`` options.
    """
    if args.hack_repo_dist:
        hacks.show_repositories_distribution()

    # Languages listed after ``--hack-add-repo``, falsy when not provided
    extra_languages = args.hack_add_repo
    if extra_languages:
        hacks.select_more_repositories(extra_languages)

    if args.hack_only_downloaded_repo:
        hacks.select_only_downloaded_repo()
102+
103+
104+
# Run the command line tool only when this module is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    main()

guesslangtools/app.py

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from guesslangtools.workflow import repositories_dataset
2+
from guesslangtools.workflow import compressed_repositories
3+
from guesslangtools.workflow import source_files
4+
5+
6+
def run_workflow() -> None:
    """Run every workflow step, one after another, in a fixed order.

    The repositories dataset steps run first, followed by the compressed
    repositories steps and finally the source files steps.
    """
    workflow_steps = (
        # Repositories dataset
        repositories_dataset.download,
        repositories_dataset.extract,
        repositories_dataset.shrink,
        repositories_dataset.alter,
        # Compressed repositories
        compressed_repositories.select,
        compressed_repositories.prepare,
        compressed_repositories.download,
        # Source files
        source_files.list_all,
        source_files.split,
        source_files.extract,
    )
    for run_step in workflow_steps:
        run_step()

0 commit comments

Comments
 (0)