-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCodeTokenizer.py
executable file
·209 lines (177 loc) · 4.24 KB
/
CodeTokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/bin/python
import sys
from ply import lex
# Token names recognized by the C lexer.  PLY requires this module-level list
# to be called `tokens`; each entry corresponds to a t_<NAME> rule below.
tokens = [
    "IDENTIFIER", "MACRO", "WHITESPACE", "COMMENT", "MCOMMENT",
    "INCLUDE", "NUMBER",
    # "LITERAL",  -- not implemented yet
    "LPARENTHESIS", "RPARENTHESIS",
    "LBRACKET", "RBRACKET",
    "LBRACE", "RBRACE",
    "SEMICOLON", "COMMA", "ASTERISK",
    "EQUAL", "ASSIGN", "DIVIDE", "NOT", "MINUSLESS",
    "STRING", "CHARACTER",
    "LESS", "LEQ", "GREATER", "GEQ", "NEQ",
    "OR", "AND", "PLUS", "MINUS",
    "QUESTION", "COLON", "BAND", "BOR", "BACKSLASH", "DOT",
]
t_BACKSLASH = (r"[\\]")
t_STRING = (r"\".*?\"")
t_CHARACTER = (r"'.*?'")
t_MINUSLESS = ("->")
t_QUESTION = (r"\?")
t_COLON = (":")
t_DOT = (".")
# unary operators
# binary operators
t_DIVIDE = ("/")
t_MINUS = ("-")
t_PLUS = (r"\+")
# bit operatos
t_BOR = (r"\|")
t_BAND = ("&")
# logical operatos
t_AND = ("&&")
t_OR = ("\|\|")
t_NOT = ("!")
# relational operators
t_GEQ = (">=")
t_LEQ = ("<=")
t_GREATER = (">")
t_LESS = ("<")
t_EQUAL = ("==")
# assigment
t_ASSIGN = ("=")
#C identifier
t_IDENTIFIER = (r"[a-zA-Z_]\w*")
#C macro
t_MACRO = (r"\#[a-zA-Z_]\w+")
#C whitespaces
t_WHITESPACE = (r"\s+")
# one line and multi-line comment
t_COMMENT = (
r"//[^\n]*"
)
t_MCOMMENT = (
r"/\*(\n|.)*?\*/"
)
# include <|"file.h"|>
t_INCLUDE = (
r"\#include\s+<.+?>|"
r"\#include\s+\".+?\""
)
# number
t_NUMBER = (r"(\+|-)?\d+(\.\d+)?(e|E((\+|-)?\d+))?")
# literal
# left,right parenthesis
t_LPARENTHESIS = (r"\(")
t_RPARENTHESIS = (r"\)")
# left, right bracket
t_LBRACKET = (r"\[")
t_RBRACKET = (r"\]")
# left, right brace
t_LBRACE = (r"\{")
t_RBRACE = (r"\}")
# semicolon, comma, asterisk
t_SEMICOLON = (r";")
t_COMMA = (r",")
t_ASTERISK = (r"\*")
def t_error(t):
    """PLY error hook: abort lexing when no rule matches the current input.

    Raises TypeError carrying the first 20 characters of the unmatched text.
    """
    snippet = t.value[0:20]
    raise TypeError("Unknown text '%s'" % (snippet,))
################################################
# Module-level accumulators filled by getCodeKeywordsOccurences():
# identifier -> {line_number: [column_numbers]} for every IDENTIFIER token.
source_code_keywords = {}
# One entry per comment: [(start_line, start_col), (end_line, end_col)].
source_code_comments = []
def getCodeKeywordsOccurences(file):
    """Tokenize a C source file and collect identifier and comment positions.

    Returns a tuple (source_code_keywords, source_code_comments) where
    source_code_keywords maps identifier -> {line_number: [column_numbers]}
    (1-based) and source_code_comments is a list of
    [(start_line, start_col), (end_line, end_col)] pairs, one per comment.

    NOTE(review): results accumulate in the module-level dict/list, so a
    second call merges occurrences from both files — presumably intended
    only for single-shot CLI use; verify before reusing as a library.
    """
    # Build the lexer from the module-level t_* rules defined above.
    lex.lex()
    with open(file, "r") as fd:
        content = fd.read()
    lex.input(content)
    # 1-based cursor giving the position where the *next* token starts.
    line_number = 1
    column_number = 1
    for tok in iter(lex.token, None):
        # for each token compute next column_number (not for actual token but for the next)
        if tok.type == 'COMMENT':
            # One-line "//" comment: record start and end coordinates.
            # NOTE(review): the width comes from len(repr(tok.value)[1:-1]),
            # which over-counts comments containing characters repr escapes
            # (quotes, backslashes, non-ASCII) — confirm this is intended.
            mcomment = []
            mcomment.append( (line_number, column_number) )
            mcomment.append( (line_number, column_number + len(repr(tok.value)[1:-1]) - 1) )
            source_code_comments.append( mcomment )
            # The trailing newline is not part of the COMMENT token; the
            # column reset anticipates the newline in the WHITESPACE token
            # that follows the comment.
            column_number = 1
        elif tok.type == 'WHITESPACE' or tok.type == 'MCOMMENT':
            mcomment = []
            if tok.type == 'MCOMMENT':
                # Start coordinate of a /* ... */ comment.
                mcomment.append( (line_number, column_number) )
            # Advance the line counter by the newlines inside the token.
            line_number = line_number + tok.value.count('\n')
            # find the last \n character
            lnl = tok.value.rfind('\n')
            ll = len(tok.value)
            if ll == (lnl + 1):
                # Token ends exactly at a newline: next token is at column 1.
                column_number = 1
            else:
                if lnl != -1:
                    # Characters remain after the last newline: column counts
                    # from the start of that final line.
                    column_number = ll - (lnl + 1) + 1
                else:
                    # No newline at all: advance within the current line.
                    column_number = column_number + ll
            if tok.type == 'MCOMMENT':
                # End coordinate (the column just past the closing "*/").
                mcomment.append( (line_number, column_number) )
                source_code_comments.append(mcomment)
        else:
            # save only identifiers
            if tok.type == 'IDENTIFIER':
                # NOTE(review): an earlier comment claimed language keywords
                # are filtered out here, but no filter is implemented —
                # every IDENTIFIER token is recorded.
                # save into db keyword and its line number
                key = repr(tok.value)
                key = key[1:-1] # get rid of ' char at the beginning and end
                if key not in source_code_keywords:
                    source_code_keywords[key] = {line_number: [column_number]}
                if line_number not in source_code_keywords[key]:
                    source_code_keywords[key][line_number] = [column_number]
                elif column_number not in source_code_keywords[key][line_number]:
                    source_code_keywords[key][line_number].append(column_number)
            # All non-whitespace tokens are single-line: just advance columns.
            column_number = column_number + len(tok.value)
    return (source_code_keywords, source_code_comments)
if __name__ == "__main__":
    # Print the (keywords, comments) tuple extracted from the file named on
    # the command line.  Parenthesized print of a single argument is valid
    # under both Python 2 and 3; the bare "print x" statement was
    # Python-2-only and a SyntaxError on Python 3.
    print(getCodeKeywordsOccurences(sys.argv[1]))