Skip to content

Commit 978564a

Browse files
ASvyatkovskiy authored and urialon committed
Duplicate path contexts in C# extractor (#7)
* Add sampling in GetInternalPaths
* Add uniqueness check in Variables
* Use StreamWriter instead of standard out
* Add MaxContexts and output file parameters, add Reservoir sampling utility function
* Flush stream to write last line
* Pass ofile_name command line option to the python script
* Change variable names to follow convention used
* Specify ofile_name argument instead of stdout
* Use a file pointed to by ofile_name rather than directing from stdout
* Revert change in the Pool size
* Use IDisposable to manage StreamWriter
1 parent 88011ab commit 978564a

File tree

6 files changed

+118
-62
lines changed

6 files changed

+118
-62
lines changed

CSharpExtractor/CSharpExtractor/Extractor/Extractor.cs

+36-26
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
using System.Collections.Generic;
77
using System.Linq;
88
using System.Text;
9+
using System.Diagnostics;
10+
911

1012
namespace Extractor
1113
{
@@ -27,12 +29,14 @@ public class Extractor
2729
public int WidthLimit { get; set; }
2830
public string Code { get; set; }
2931
public bool ShouldHash { get; set; }
32+
public int MaxContexts { get; set; }
3033

31-
public Extractor(string code, Options opts)
34+
public Extractor(string code, Options opts)
3235
{
3336
LengthLimit = opts.MaxLength;
3437
WidthLimit = opts.MaxWidth;
3538
ShouldHash = !opts.NoHash;
39+
MaxContexts = opts.MaxContexts;
3640
Code = code;
3741
}
3842

@@ -104,29 +108,32 @@ private string PathToString(PathFinder.Path path)
104108
return builder.ToString();
105109
}
106110

107-
internal IEnumerable<PathFinder.Path> GetInternalPaths(Tree tree)
108-
{
111+
internal IEnumerable<PathFinder.Path> GetInternalPaths(Tree tree)
112+
{
109113
var finder = new PathFinder(tree, LengthLimit, WidthLimit);
110-
foreach (Tuple<Variable, Variable> varPair in
111-
Utilities.WeakConcat(Utilities.Choose2(variables),
112-
variables.Select((arg) => new Tuple<Variable,Variable>(arg,arg))))
113-
{
114-
bool pathToSelf = varPair.Item1 == varPair.Item2;
115-
116-
foreach(var lhs in varPair.Item1.Leaves)
117-
foreach (var rhs in varPair.Item2.Leaves)
118-
{
119-
if (lhs == rhs)
120-
continue;
121-
122-
PathFinder.Path path = finder.FindPath(lhs, rhs, limited: true);
123-
124-
if (path == null)
125-
continue;
126-
127-
128-
yield return path;
129-
}
114+
115+
var allPairs = Utilities.ReservoirSample(Utilities.WeakConcat(Utilities.Choose2(variables),
116+
variables.Select((arg) => new Tuple<Variable, Variable>(arg, arg))), MaxContexts);
117+
118+
//iterate over variable-variable pairs
119+
foreach (Tuple<Variable, Variable> varPair in allPairs)
120+
{
121+
bool pathToSelf = varPair.Item1 == varPair.Item2;
122+
123+
foreach (var rhs in varPair.Item2.Leaves)
124+
foreach (var lhs in varPair.Item1.Leaves)
125+
{
126+
127+
if (lhs == rhs)
128+
continue;
129+
130+
PathFinder.Path path = finder.FindPath(lhs, rhs, limited: true);
131+
132+
if (path == null)
133+
continue;
134+
135+
yield return path;
136+
}
130137
}
131138
}
132139

@@ -167,6 +174,7 @@ public List<String> Extract()
167174
List<String> results = new List<string>();
168175

169176
foreach(var method in methods) {
177+
170178
String methodName = method.Identifier.ValueText;
171179
Tree methodTree = new Tree(method);
172180
var subtokensMethodName = Utilities.SplitToSubtokens(methodName);
@@ -185,10 +193,12 @@ public List<String> Extract()
185193

186194
foreach (PathFinder.Path path in GetInternalPaths(methodTree))
187195
{
188-
contexts.Add(SplitNameUnlessEmpty(tokenToVar[path.Left].Name)
196+
String pathString = SplitNameUnlessEmpty(tokenToVar[path.Left].Name)
189197
+ "," + MaybeHash(this.PathNodesToString(path))
190-
+ "," + SplitNameUnlessEmpty(tokenToVar[path.Right].Name));
198+
+ "," + SplitNameUnlessEmpty(tokenToVar[path.Right].Name);
191199

200+
Debug.WriteLine(path.Left.FullSpan+" "+tokenToVar[path.Left].Name+ "," +this.PathNodesToString(path)+ "," + tokenToVar[path.Right].Name+" "+path.Right.FullSpan);
201+
contexts.Add(pathString);
192202
}
193203

194204
var commentNodes = tree.GetRoot().DescendantTrivia().Where(
@@ -206,7 +216,7 @@ public List<String> Extract()
206216
contexts.Add(batch + "," + "COMMENT" + "," + batch);
207217
}
208218
}
209-
results.Add(String.Join("|", subtokensMethodName) + " " + String.Join(" ", contexts));
219+
results.Add(String.Join("|", subtokensMethodName) + " " + String.Join(" ", contexts));
210220
}
211221
return results;
212222
}

CSharpExtractor/CSharpExtractor/Extractor/Program.cs

+5-2
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,12 @@ static void Main(string[] args)
4444

4545
results = files.AsParallel().WithDegreeOfParallelism(options.Threads).SelectMany(filename => ExtractSingleFile(filename, options));
4646

47-
foreach (var res in results)
47+
using (StreamWriter sw = new StreamWriter(options.OFileName, append: true))
4848
{
49-
Console.WriteLine(res);
49+
foreach (var res in results)
50+
{
51+
sw.WriteLine(res);
52+
}
5053
}
5154
}
5255
}

CSharpExtractor/CSharpExtractor/Extractor/Utilities.cs

+43-2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
using System.Collections.Generic;
44
using System.Linq;
55
using System.Text;
6+
using System.Diagnostics;
67
using System.Text.RegularExpressions;
78

89
namespace Extractor
@@ -21,11 +22,17 @@ public class Options
2122
[Option('l', "max_width", Default = 2, HelpText = "Max path length")]
2223
public int MaxWidth { get; set; }
2324

25+
[Option('o', "ofile_name", Default = "test.txt", HelpText = "Output file name")]
26+
public String OFileName { get; set; }
27+
2428
[Option('h', "no_hash", Default = false, HelpText = "When enabled, prints the whole path strings (not hashed)")]
2529
public Boolean NoHash { get; set; }
30+
31+
/// <summary>
/// Maximum number of path contexts to sample (default 30000).
/// </summary>
// Long name only: the short name 'l' is already used by the max_width option,
// so declaring it again here made "-l" ambiguous on the command line.
[Option("max_contexts", Default = 30000, HelpText = "Max number of path contexts to sample. Affects only very large snippets")]
public int MaxContexts { get; set; }
2633
}
2734

28-
public class Utilities
35+
public static class Utilities
2936
{
3037
public static String[] NumbericLiteralsToKeep = new String[] { "0", "1", "2", "3", "4", "5", "10" };
3138
public static IEnumerable<Tuple<T, T>> Choose2<T>(IEnumerable<T> enumerable)
@@ -40,7 +47,41 @@ public static IEnumerable<Tuple<T, T>> Choose2<T>(IEnumerable<T> enumerable)
4047
}
4148
}
4249

43-
public static IEnumerable<T> WeakConcat<T>(IEnumerable<T> enumerable1, IEnumerable<T> enumerable2)
50+
/// <summary>
/// Draws a uniform random sample of at most <paramref name="numSamples"/> elements from
/// <paramref name="input"/> in a single pass, using reservoir sampling.
/// See https://en.wikipedia.org/wiki/Reservoir_sampling
/// </summary>
/// <typeparam name="TSource">Element type of the sequence being sampled.</typeparam>
/// <param name="input">Sequence to sample from; it is enumerated exactly once.</param>
/// <param name="numSamples">Maximum number of elements to return; must not be negative.</param>
/// <returns>
/// Every element of <paramref name="input"/>, in the original order, when the sequence holds
/// no more than <paramref name="numSamples"/> elements; otherwise exactly
/// <paramref name="numSamples"/> elements chosen uniformly at random.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="numSamples"/> is negative.</exception>
public static IEnumerable<TSource> ReservoirSample<TSource>(this IEnumerable<TSource> input, int numSamples)
{
    if (input == null)
    {
        throw new ArgumentNullException(nameof(input));
    }
    if (numSamples < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(numSamples), "numSamples must not be negative.");
    }

    var rng = new Random();
    // Let the list grow on demand: the limit is typically far larger than the input,
    // so reserving numSamples slots up front wastes memory on every call.
    var sampledElements = new List<TSource>();
    int seenElementCount = 0;
    foreach (var element in input)
    {
        seenElementCount++;
        if (sampledElements.Count < numSamples)
        {
            // Reservoir not full yet: keep everything.
            sampledElements.Add(element);
        }
        else
        {
            // The k-th element replaces a random slot with probability numSamples / k,
            // which keeps every element seen so far equally likely to be in the sample.
            int position = rng.Next(seenElementCount);
            if (position < numSamples)
            {
                sampledElements[position] = element;
            }
        }
    }
    Debug.Assert(sampledElements.Count <= numSamples);
    return sampledElements;
}
82+
83+
84+
public static IEnumerable<T> WeakConcat<T>(IEnumerable<T> enumerable1, IEnumerable<T> enumerable2)
4485
{
4586
foreach (T t in enumerable1)
4687
yield return t;

CSharpExtractor/CSharpExtractor/Extractor/Variable.cs

+9-2
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,15 @@ internal static IEnumerable<Variable> CreateFromMethod(Tree methodTree)
9494
string name = tokenToName[leaf];
9595
SyntaxToken[] syntaxTokens = nameToTokens[name].ToArray();
9696
var v = new Variable(name, syntaxTokens, methodTree);
97-
results.Add(v);
98-
}
97+
98+
//check if exists
99+
var matches = results.Where(p => p.Name == name).ToList();
100+
bool alreadyExists = (matches.Count != 0);
101+
if (!alreadyExists)
102+
{
103+
results.Add(v);
104+
}
105+
}
99106

100107
return results;
101108
}

CSharpExtractor/extract.py

+22-27
Original file line numberDiff line numberDiff line change
@@ -27,35 +27,30 @@ def ParallelExtractDir(args, dir):
2727
def ExtractFeaturesForDir(args, dir, prefix):
2828
command = ['dotnet', 'run', '--project', args.csproj,
2929
'--max_length', str(args.max_path_length), '--max_width', str(args.max_path_width),
30-
'--path', dir, '--threads', str(args.num_threads)]
30+
'--path', dir, '--threads', str(args.num_threads), '--ofile_name', str(args.ofile_name)]
31+
3132

3233
# print command
3334
# os.system(command)
3435
kill = lambda process: process.kill()
35-
outputFileName = TMP_DIR + prefix + dir.split('/')[-1]
36-
failed = False
37-
with open(outputFileName, 'a') as outputFile:
38-
sleeper = subprocess.Popen(command, stdout=outputFile, stderr=subprocess.PIPE)
39-
timer = Timer(600000, kill, [sleeper])
40-
41-
try:
42-
timer.start()
43-
stdout, stderr = sleeper.communicate()
44-
finally:
45-
timer.cancel()
46-
47-
if sleeper.poll() == 0:
48-
if len(stderr) > 0:
49-
print(sys.stderr, stderr, file=sys.stdout)
50-
else:
51-
print(sys.stderr, 'dir: ' + str(dir) + ' was not completed in time', file=sys.stdout)
52-
failed = True
53-
subdirs = get_immediate_subdirectories(dir)
54-
for subdir in subdirs:
55-
ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_')
56-
if failed:
57-
if os.path.exists(outputFileName):
58-
os.remove(outputFileName)
36+
sleeper = subprocess.Popen(command, stderr=subprocess.PIPE)
37+
timer = Timer(600000, kill, [sleeper])
38+
39+
try:
40+
timer.start()
41+
_, stderr = sleeper.communicate()
42+
finally:
43+
timer.cancel()
44+
45+
if sleeper.poll() == 0:
46+
if len(stderr) > 0:
47+
print(sys.stderr, stderr)
48+
else:
49+
print(sys.stderr, 'dir: ' + str(dir) + ' was not completed in time')
50+
failed = True
51+
subdirs = get_immediate_subdirectories(dir)
52+
for subdir in subdirs:
53+
ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_')
5954

6055

6156
def ExtractFeaturesForDirsList(args, dirs):
@@ -77,12 +72,14 @@ def ExtractFeaturesForDirsList(args, dirs):
7772

7873

7974
if __name__ == '__main__':
75+
8076
parser = ArgumentParser()
8177
parser.add_argument("-maxlen", "--max_path_length", dest="max_path_length", required=False, default=8)
8278
parser.add_argument("-maxwidth", "--max_path_width", dest="max_path_width", required=False, default=2)
8379
parser.add_argument("-threads", "--num_threads", dest="num_threads", required=False, default=64)
8480
parser.add_argument("--csproj", dest="csproj", required=True)
8581
parser.add_argument("-dir", "--dir", dest="dir", required=False)
82+
parser.add_argument("-ofile_name", "--ofile_name", dest="ofile_name", required=True)
8683
args = parser.parse_args()
8784

8885
if args.dir is not None:
@@ -91,5 +88,3 @@ def ExtractFeaturesForDirsList(args, dirs):
9188
if len(subdirs) == 0:
9289
to_extract = [args.dir.rstrip('/')]
9390
ExtractFeaturesForDirsList(args, to_extract)
94-
95-

preprocess_csharp.sh

+3-3
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,13 @@ mkdir -p data
3939
mkdir -p data/${DATASET_NAME}
4040

4141
echo "Extracting paths from validation set..."
42-
${PYTHON} CSharpExtractor/extract.py --dir ${VAL_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} > ${VAL_DATA_FILE}
42+
${PYTHON} CSharpExtractor/extract.py --dir ${VAL_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${VAL_DATA_FILE}
4343
echo "Finished extracting paths from validation set"
4444
echo "Extracting paths from test set..."
45-
${PYTHON} CSharpExtractor/extract.py --dir ${TEST_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} > ${TEST_DATA_FILE}
45+
${PYTHON} CSharpExtractor/extract.py --dir ${TEST_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${TEST_DATA_FILE}
4646
echo "Finished extracting paths from test set"
4747
echo "Extracting paths from training set..."
48-
${PYTHON} CSharpExtractor/extract.py --dir ${TRAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} > ${TRAIN_DATA_FILE}
48+
${PYTHON} CSharpExtractor/extract.py --dir ${TRAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${TRAIN_DATA_FILE}
4949
echo "Finished extracting paths from training set"
5050

5151
TARGET_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.tgt.c2v

0 commit comments

Comments
 (0)