from typing import List, Optional
import re
import dataclasses
import enum


class TokenType(enum.Enum):
    operator = enum.auto()
    directive_begin = enum.auto()
    word = enum.auto()
    error = enum.auto()


@dataclasses.dataclass
class Token:
    lexeme: str = ""
    type: TokenType = TokenType.operator
    had_space_before: bool = False
    containing_directive: Optional[str] = None
    previous: Optional["Token"] = None

    def __repr__(self):
        return (f"{self.lexeme!r} as {self.type.name}" +
                (f" in #{self.containing_directive}" if self.containing_directive else "") +
                f", after <{self.previous.type.name if self.previous else None}>")


def any_of(l):
    """Builds a regex that matches any one of the given literal strings (or characters)."""
    return re.compile("(" + "|".join(re.escape(s) for s in l) + ")")


# Type of a compiled regular expression (re.Pattern on modern Python).
Re = type(re.compile(""))


def tokenize(s: str, raise_errors: bool = True):
    """Splits C-like source code into a list of Tokens, recursing into preprocessor directives."""
    s = "\n" + s + "\n"
    s = re.sub(r"\t+", " ", s)
    s = re.sub(r"\\\n", "", s)  # Join lines continued with a trailing backslash.
    s = re.sub(r"\n +", "\n", s)
    i, n = 0, len(s)
    tokens: List[Token] = []

    def try_eat(e: Re, token_type: Optional[TokenType] = None,
                lexeme_group=0, custom_finalizer=None):
        """Tries to eat a token matching the expression, appends it and moves the cursor if successful."""
        nonlocal i
        m = e.search(s, i, i + 2048)
        if not m or m.start(0) != i or len(m.group(0)) == 0:
            return False
        if token_type is not None:
            tokens.append(Token(m.group(lexeme_group), token_type, s[i - 1] in " \n"))
        i = m.end(0)
        if custom_finalizer is not None:
            custom_finalizer(m)
        return True

    while i < n:
        # Whitespace and comments.
        if try_eat(re.compile(r"[ \n]+")): continue
        if try_eat(re.compile(r"//.*")): continue
        if try_eat(re.compile(r"/\*.*?\*/", re.DOTALL)): continue
        # Preprocessor directives: tokenize the rest of the line recursively and tag the
        # resulting tokens with the directive they belong to.
        if s[i - 1] == "\n":
            def finalize(m):
                j = len(tokens) - 1
                tokens.extend(tokenize(m.group(2)))
                for token in tokens[j:]:
                    token.containing_directive = m.group(1)[1:].lstrip()
            if try_eat(re.compile(r"(# *[a-zA-Z]*)(.*)"), TokenType.directive_begin, lexeme_group=1,
                       custom_finalizer=finalize): continue
        # Generic identifiers and numbers.
        if try_eat(re.compile(r"(\.\d)?(\d\.\d|\w)*"), TokenType.word): continue
        # Operators.
        short_operators = any_of("()[].!~*&/%<>^|?:=,;!@#$%^&*()-+=[]{};:,.<>/?~")
        long_operators = any_of(["->", "++", "--", "<<", ">>", "<=", ">=", "==", "!=", "&&", "||", "+=", "-=",
                                 "*=", "/=", "%=", "&=", "^=", "|=", "...", "##", "::"])
        if try_eat(long_operators, TokenType.operator): continue  # Always prefer the longer operator.
        if try_eat(short_operators, TokenType.operator): continue
        # String and character literals.
        string = r'"(\\\\|\\"|[^\n"])*"\w*'
        if try_eat(re.compile(string), TokenType.word): continue
        if try_eat(re.compile(string.replace('"', "'")), TokenType.word): continue
        if raise_errors:
            raise SyntaxError(f"Error while parsing at {i} at: {s[i:i + 64]}")
        else:
            try_eat(re.compile(".", re.DOTALL), TokenType.error)
    tokens = set_previous_infos(tokens)
    return tokens


def set_previous_infos(tokens: List[Token]):
    """Links every token to the one before it via the `previous` field."""
    for i, token in enumerate(tokens):
        token.previous = tokens[i - 1] if i > 0 else None
    return tokens
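
# Usage note for tokenize() above (example assumed, not exercised by the module):
# tokenize("int x = 1; // comment") yields five tokens with lexemes
# "int", "x", "=", "1", ";" -- the comment and surrounding whitespace are dropped,
# and each token is linked to its predecessor via set_previous_infos().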


def serialize(tokens: List[Token]):
    """Joins tokens back into source text, emitting only the whitespace that is required."""
    def is_after_directive(token: Token):
        return token.containing_directive is None and token.previous.containing_directive is not None

    def requires_newline(token: Token):
        return is_after_directive(token) or token.type == TokenType.directive_begin

    def requires_space(token: Token):
        # This is terrible. Please send help.
        return ((token.type == TokenType.word and token.previous.type != TokenType.operator) or
                (token.type == TokenType.operator and token.lexeme[0] == token.previous.lexeme[-1] and
                 token.lexeme[0] in ("+", "-") and len(token.lexeme) >= len(token.previous.lexeme)) or
                (token.previous.type == TokenType.directive_begin) or
                (token.previous.previous is not None and token.previous.previous.type == TokenType.directive_begin and
                 token.containing_directive == "define" and token.had_space_before))

    s = [tokens[0].lexeme]
    for current_token in tokens[1:]:
        if requires_newline(current_token):
            s.append("\n")
        elif requires_space(current_token):
            s.append(" ")
        s.append(current_token.lexeme)
    return "".join(s).strip()


def rename_tokens(tokens: List[Token], operators: bool = False):
    """Replaces identifiers (and optionally operators) outside directives with var_<n> names
    and prepends #define directives that map each new name back to the original lexeme."""
    def should_be_renamed(token):
        if token.type == TokenType.word:
            return token.containing_directive is None
        if operators and token.type == TokenType.operator and not token.containing_directive:
            return True
        return False

    originals = sorted(set(token.lexeme for token in tokens if should_be_renamed(token)))
    replacements = [f"var_{i}" for i, original in enumerate(originals)]
    original_to_replacement = {original: replacement for original, replacement in zip(originals, replacements)}
    defines = tokenize("\n".join(f"#define {renamed} {original}"
                                 for original, renamed in zip(originals, replacements)))
    tokens = set_previous_infos(tokens)
    for i, token in enumerate(tokens):
        if should_be_renamed(token):
            tokens[i] = Token(original_to_replacement[token.lexeme], TokenType.word, token.had_space_before,
                              token.containing_directive, token.previous)
            if i + 1 < len(tokens):
                tokens[i + 1].previous = tokens[i]
    return defines + tokens
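

if __name__ == "__main__":
    # Illustrative demo (assumed usage; the module itself defines no entry point):
    # tokenize a snippet, rename its identifiers, and print the obfuscated source,
    # which starts with #define lines mapping var_<n> back to the original names.
    source = "int add(int a, int b) { return a + b; }"
    # Re-link tokens across the prepended #define block before serializing.
    obfuscated = set_previous_infos(rename_tokens(tokenize(source)))
    print(serialize(obfuscated))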