Skip to content

Commit b171535

Browse files
Merge pull request #22 from christianversloot/dataset-release
Dataset release
2 parents 9f705a4 + d0ba71a commit b171535

File tree

4 files changed

+123
-1
lines changed

4 files changed

+123
-1
lines changed

README.md

+18
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ _The names TensorFlow, Keras, as well as related names, marks, emblems and image
3030
* [STL-10](#stl-10)
3131
* [Iris](#iris)
3232
* [Wine Quality dataset](#wine-quality-dataset)
33+
* [USPS Handwritten Digits Dataset](#usps-handwritten-digits-dataset)
3334
- [Contributors and other references](#contributors-and-other-references)
3435
- [License](#license)
3536

@@ -206,6 +207,21 @@ from extra_keras_datasets import wine_quality
206207

207208
---
208209

210+
### USPS Handwritten Digits Dataset
211+
This dataset presents thousands of 16x16 grayscale images of handwritten digits, generated from real USPS mail.
212+
213+
* Input structure: 16x16 image
214+
* Target structure: digit ranging from 0.0 - 9.0 describing the input
215+
216+
```
217+
from extra_keras_datasets import usps
218+
(input_train, target_train), (input_test, target_test) = usps.load_data()
219+
```
220+
221+
<a href="./assets/usps.png"><img src="./assets/usps.png" width="100%" style="border: 3px solid #f6f8fa;" /></a>
222+
223+
---
224+
209225
## Contributors and other references
210226
* **EMNIST dataset:**
211227
* Cohen, G., Afshar, S., Tapson, J., & van Schaik, A. (2017). EMNIST: an extension of MNIST to handwritten letters. Retrieved from http://arxiv.org/abs/1702.05373
@@ -220,6 +236,8 @@ from extra_keras_datasets import wine_quality
220236
* Fisher, R.A. "The use of multiple measurements in taxonomic problems" Annals of Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to Mathematical Statistics" (John Wiley, NY, 1950).
221237
* **Wine Quality dataset:**
222238
* P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.
239+
* **USPS Handwritten Digits Dataset:**
240+
* Hull, J. J. (1994). A database for handwritten text recognition research. IEEE Transactions on pattern analysis and machine intelligence, 16(5), 550-554.
223241

224242
## License
225243
The licensable parts of this repository are licensed under an [MIT License](./LICENSE), so you're free to use this repo in your machine learning projects / blogs / exercises, and so on. Happy engineering! 🚀

assets/usps.png

5.99 KB
Loading

extra_keras_datasets/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,6 @@
88
from . import stl10
99
from . import iris
1010
from . import wine_quality
11+
from . import usps
1112

12-
__all__ = ['emnist', 'kmnist', 'svhn', 'stl10', 'iris', 'wine_quality']
13+
__all__ = ['emnist', 'kmnist', 'svhn', 'stl10', 'iris', 'wine_quality', 'usps']

extra_keras_datasets/usps.py

+103
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
"""
2+
Import the USPS Handwritten Digits Dataset
3+
Source: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
4+
multiclass.html#usps
5+
(and: https://ieeexplore.ieee.org/document/291440)
6+
Description: Handwritten text recognition image database.
7+
8+
~~~ Important note ~~~
9+
Please cite the following paper when using or referencing the dataset:
10+
Hull, J. J. (1994). A database for handwritten text recognition
11+
research. IEEE Transactions on pattern analysis and machine
12+
intelligence, 16(5), 550-554.
13+
"""
14+
15+
from tensorflow.keras.utils import get_file
16+
import logging
17+
from sklearn.datasets import load_svmlight_file
18+
import bz2
19+
20+
21+
def warn_citation():
    """Emit the citation request for the USPS dataset as log warnings.

    Two warnings are written through the root logger: an introductory
    line followed by the full reference of the paper to cite.

    # Returns
        None
    """
    citation_lines = (
        ("Please cite the following paper when using or"
         " referencing this Extra Keras Dataset:"),
        ("Hull, J. J. (1994). A database for handwritten text "
         "recognition research. IEEE Transactions on pattern analysis and "
         "machine intelligence, 16(5), 550-554."),
    )
    # One warning per line, in order: the header first, then the reference.
    for line in citation_lines:
        logging.warning(line)
33+
34+
35+
def decompress(path):
    """Decompress a BZ2 archive into a separate file next to it.

    # Arguments
        path: path (string) of the BZ2-compressed file to read.
    # Returns
        Path of the decompressed file. This is `path` with its `.bz2`
        suffix removed; when `path` has no such suffix, `.decompressed`
        is appended instead so the archive itself is never overwritten
        and distinct archives never map to the same output file.
    """
    # Context managers close both handles (and flush the output) even if
    # reading or writing raises; previously neither file was closed.
    with bz2.BZ2File(path) as archive:
        decompressed_data = archive.read()

    # Only strip the suffix when it is really there. Blindly dropping the
    # last four characters made unrelated names collide (e.g. "train" and
    # "tests" both became "t").
    suffix = '.bz2'
    if path.endswith(suffix):
        new_path = path[:-len(suffix)]
    else:
        new_path = path + '.decompressed'

    with open(new_path, 'wb') as target:
        target.write(decompressed_data)
    return new_path
42+
43+
44+
def load_to_numpy(path):
    """Read a LIBSVM-formatted file and return dense inputs plus targets.

    # Arguments
        path: path of the (decompressed) LIBSVM file.
    # Returns
        Tuple `(inputs, targets)`: the loaded feature matrix converted
        with `.toarray()`, and the targets as returned by the loader.
    """
    features, labels = load_svmlight_file(path)
    dense_features = features.toarray()
    return (dense_features, labels)
48+
49+
50+
def load_data(
    path="usps.bz2",
    path_testing="usps-testing.bz2"
):
    """Download, unpack and return the USPS Handwritten Digits Dataset.

    # Arguments
        path: path where to cache the USPS training data locally
            (relative to ~/.keras/datasets).
        path_testing: path where to cache the USPS testing data locally
            (relative to ~/.keras/datasets).
    # Returns
        Tuple of Numpy arrays: `(input_train, target_train),
            (input_test, target_test)`.
        Input structure: 16x16 image with a digit
        Target structure: number in the 0.0 - 9.0 range
    """
    # Announce the load (this also configures root logging at INFO level).
    logging.basicConfig(level=logging.INFO)
    logging.info('Loading dataset = usps')

    # Fetch both archives first, then unpack them, then parse them.
    train_url = ("https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/"
                 "datasets/multiclass/usps.bz2")
    test_url = ("https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/"
                "datasets/multiclass/usps.t.bz2")
    train_archive = get_file(path, origin=train_url)
    test_archive = get_file(path_testing, origin=test_url)

    train_file = decompress(train_archive)
    test_file = decompress(test_archive)

    train_images, train_labels = load_to_numpy(train_file)
    test_images, test_labels = load_to_numpy(test_file)

    # Turn every flat sample row into a 16x16 image.
    n_train = train_images.shape[0]
    n_test = test_images.shape[0]
    train_images = train_images.reshape(n_train, 16, 16)
    test_images = test_images.reshape(n_test, 16, 16)

    # Per the original note, the raw targets are shifted up by one
    # (digit 3 arrives as 4.0), so move them back by one.
    train_labels = train_labels - 1
    test_labels = test_labels - 1

    # Remind the user of the citation requirement before handing back data.
    warn_citation()

    return (train_images, train_labels), (test_images, test_labels)

0 commit comments

Comments
 (0)