|
5 | 5 | Inspired by the Tufts University COMP 11 Final project "trigrams", Fall 2015
|
6 | 6 |
|
7 | 7 | TODO:
|
8 |
| - - Remove details from description (a help message should be concise); |
9 |
| - add details to README |
10 | 8 | - Allow wildcards and such in filenames. See expanduser(), expandvars(), glob
|
11 | 9 | - Add caching for speedup (either single files, by language, or by directory)
|
12 | 10 | - Need to figure out when to use cache, when to update
|
|
15 | 13 | - Need to minimize the risk of accidental duplication of data,
|
16 | 14 | i.e. having a file's data in cache and then reading it again in
|
17 | 15 | addition to that
|
| 16 | + - Could keep filenames and last-modified dates to check whether the cached data is up to date |
18 | 17 | - Would need a hard refresh option, probably one for individual
|
19 | 18 | languages, and one to refresh all languages
|
20 | 19 | - Add option for directory traversal
|
| 20 | + - Implement prediction -- language name, optional "seed", num letters to |
| 21 | + predict, choose randomized or max likelihood (both using randomization |
| 22 | + for tiebreakers) |
21 | 23 | """
|
22 | 24 | import sys
|
23 | 25 | import os
|
|
26 | 28 | import language_match
|
27 | 29 |
|
28 | 30 |
|
29 |
| -# Make more concise, add details to README |
30 |
| -DESCRIPTION = ("Compares documents written in unknown languages to known languages. " + |
31 |
| -"An input file must be provided with known languages, with lines of the form" + |
32 |
| -""" |
33 |
| - English english1.txt english2.txt |
34 |
| - French french/ |
35 |
| - English english3.txt |
36 |
| - Unknown tbd.txt |
37 |
| -""" + |
38 |
| -"Where each line has a language name and the name of 1 or more files written in that language, " + |
39 |
| -"or the name of a directory containing files in that language. " + |
40 |
| -"If the language name is the 'Unknown' keyword, the language will be classified.") |
| 31 | +DESCRIPTION = ("Compares documents written in unknown languages to known languages.") |
41 | 32 |
|
42 | 33 |
|
43 | 34 | parser = argparse.ArgumentParser(description=DESCRIPTION,
|
|
54 | 45 | parser.add_argument("--unknown", "-u",
|
55 | 46 | help="the keyword designating unknown languages in input file " +
|
56 | 47 | "(default '%(default)s')")
|
57 |
| -parser.add_argument("--data", "-d", default=None, |
| 48 | +parser.add_argument("--data", "-d", default=None, # Just a flag? Make hidden files for langs? |
58 | 49 | help="file to use as cache for languages") #read and write to
|
59 | 50 |
|
60 | 51 | parser.set_defaults(n_gram_max=3,
|
|
0 commit comments