# SpecialCharacterToWords.py
# Forked from AmadeusBugProject/artifact_detection.
import re
from sklearn.base import BaseEstimator, TransformerMixin
class SpecialCharacterToWords(BaseEstimator, TransformerMixin):
    """Transformer that rewrites special characters as 'JJJ...' placeholder words.

    Each input text is passed through the module-level replacement tables
    (``rec_map``, optionally ``rec_all_caps``, then ``char_map`` and
    ``unic_char_map``) and wrapped in line-start / line-end markers.
    """

    def __init__(self, repl_all_caps=True):
        """Store the configuration.

        :param repl_all_caps: when true, the ``rec_all_caps`` pattern is
            applied in addition to the patterns in ``rec_map``.
        """
        self.repl_all_caps = repl_all_caps

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return a list holding the converted form of every item in ``X``."""
        converted = []
        for document in X:
            converted.append(self.replace_special_char(document))
        return converted

    def replace_special_char(self, text):
        """Replace patterns and special characters in ``text`` with placeholders.

        The steps run in a fixed order: regex patterns from ``rec_map``, the
        optional all-caps pattern, then the literal replacements from
        ``char_map`` followed by those from ``unic_char_map``.

        :param text: the string to convert.
        :return: the converted string, prefixed with ' JJJnewline ' and
            suffixed with ' JJJendline '.
        """
        # Regex-based replacements run first, in table order.
        for pattern, placeholder in rec_map:
            text = pattern.sub(placeholder, text)

        if self.repl_all_caps:
            caps_pattern, caps_placeholder = rec_all_caps
            text = caps_pattern.sub(caps_placeholder, text)

        # Literal replacements: the ASCII table first, then the Unicode one.
        for table in (char_map, unic_char_map):
            for needle, placeholder in table:
                text = text.replace(needle, placeholder)

        return f' JJJnewline {text} JJJendline '
# Ordered (needle, replacement) pairs applied top to bottom with str.replace.
# Order matters: longer runs (four spaces, '####') must be consumed before
# their shorter prefixes, and whitespace runs are tagged before any padded
# placeholder is inserted by the later entries of this table.
char_map = [
    # Whitespace runs. The widths are written as multiplications so that they
    # stay unambiguous even if the file passes through a tool that collapses
    # consecutive spaces (a single ' ' here would tag every space as a
    # quad-space and leave the double-space entry unreachable).
    (' ' * 4, 'JJJquadspace'),
    (' ' * 2, 'JJJdoublespace'),
    # (' ', 'JJJspace'),
    # Pad the whitespace tags only after all runs have been tagged, so the
    # padding itself is not mistaken for a whitespace run.
    ('JJJquadspace', ' JJJquadspace '),
    ('JJJdoublespace', ' JJJdoublespace '),
    # ('JJJspace', ' JJJspace '),
    ('\n', ' JJJendline \n JJJnewline '),
    ('\t', ' JJJtabulator '),
    ('~', ' JJJtilde '),
    ('!', ' JJJexclamation '),
    ('@', ' JJJat '),
    # Hash runs, longest first.
    ('####', ' JJJquadhash '),
    ('###', ' JJJtriplehash '),
    ('##', ' JJJdoublehash '),
    ('#', ' JJJsinglehash '),
    ('$', ' JJJdollar '),
    ('%', ' JJJpercent '),
    ('^', ' JJJhat '),
    ('&', ' JJJampersand '),
    ('*', ' JJJasterisk '),
    ('(', ' JJJroundbracketopen '),
    (')', ' JJJroundbracketclose '),
    ('_', ' JJJunderscore '),
    ('-', ' JJJminus '),
    ('=', ' JJJequals '),
    ('+', ' JJJplus '),
    # NOTE: the 'sqare' spelling is part of the emitted token and is kept
    # unchanged so the output vocabulary stays the same.
    ('[', ' JJJsqarebracketopen '),
    (']', ' JJJsqarebracketclose '),
    ('{', ' JJJcurlybracketopen '),
    ('}', ' JJJcurlybracketclose '),
    (';', ' JJJsemicolon '),
    (':', ' JJJcolon '),
    ("'", ' JJJsinglequote '),
    ('"', ' JJJquote '),
    ('\\', ' JJJbackslash '),
    ('|', ' JJJpipe '),
    (',', ' JJJcomma '),
    ('<', ' JJJsmaller '),
    ('.', ' JJJdot '),
    ('>', ' JJJlarger '),
    ('/', ' JJJslash '),
    ('`', ' JJJbacktick '),
    ('?', ' JJJquestion ')]
# (needle, replacement) pairs for non-ASCII characters, applied with
# str.replace after char_map. Several characters share one placeholder.
unic_char_map = [
    # Typographic ticks and quotes.
    ('’', ' JJJtick '),
    ('´', ' JJJtick '),
    ('‘', ' JJJtick '),
    ('“', ' JJJsquote '),
    ('”', ' JJJsquote '),
    # Colon / comma look-alikes. The fullwidth forms are written as escapes:
    # a plain ASCII ':' or ',' here could never match, because char_map has
    # already replaced every ASCII colon and comma before this table runs.
    ('\uff1a', ' JJJcolon '),  # fullwidth colon
    ('﹕', ' JJJcolon '),
    ('\uff0c', ' JJJcomma '),  # fullwidth comma
    ('…', ' JJJellipsis '),
    ('–', ' JJJminus '),
    ('—', ' JJJminus '),
    # Angle quotation marks.
    ('«', ' JJJpointybracketopen '),
    ('›', ' JJJpointybracketclose '),
    ('»', ' JJJpointybracketclose '),
    ('‹', ' JJJpointybracketopen '),
    # Arrows.
    ('👉', ' JJJunicodearrow '),
    ('→', ' JJJunicodearrow '),
    ('↓', ' JJJunicodearrow '),
    # Box-drawing characters.
    ('│', ' JJJunicodebox '),
    ('├', ' JJJunicodebox '),
    ('┌', ' JJJunicodebox '),
    ('└', ' JJJunicodebox '),
    ('┤', ' JJJunicodebox '),
    ('┐', ' JJJunicodebox '),
    ('┘', ' JJJunicodebox '),
    ('┬', ' JJJunicodebox '),
    ('─', ' JJJunicodebox '),
    ('║', ' JJJunicodebox '),
    ('╚', ' JJJunicodebox '),
    ('╔', ' JJJunicodebox '),
    ('═', ' JJJunicodebox '),
    ('╗', ' JJJunicodebox '),
    ('╝', ' JJJunicodebox '),
    ('§', ' JJJparagraph '),
    # Bullet characters.
    ('·', ' JJJitemize '),
    ('•', ' JJJitemize '),
    ('●', ' JJJitemize '),
    # Check marks.
    ('✔', ' JJJCheckmark '),
    ('✓', ' JJJCheckmark ')]
# Compiled (pattern, placeholder) pairs, applied in this order with
# pattern.sub: exception/error class names, camel-cased identifiers,
# underscore-joined identifiers, lowercase hex literals, digit runs.
rec_map = [
    (re.compile(expression), placeholder)
    for expression, placeholder in (
        (r"(?:(?:[A-Z][a-z0-9]*)+(?:Exception|Error))", ' JJJexception '),
        (r"(?:(?:[A-Z]?[a-z0-9]+)(?:[A-Z][a-z0-9]*)+)", ' JJJcamelcased '),
        (r"(?:(?:\w+_)+\w+)", ' JJJunderscored '),
        (r"(?:0x[a-f0-9]+)", ' JJJhex '),
        (r"\d+", ' JJJnumber '),
    )
]
# (pattern, placeholder) pair for runs of three or more capital letters.
# The negative lookahead skips the 'JJJ' prefix of placeholder words such as
# ' JJJnumber ' that the rec_map substitutions have already inserted by the
# time this pattern runs; without it those placeholders are themselves
# rewritten (' JJJnumber ' -> '  JJJallcaps number ').
# NOTE(review): this changes the emitted tokens for inputs that hit rec_map;
# models fitted on the previous output should be refitted.
rec_all_caps = (re.compile(r"(?:(?!JJJ[a-z])[A-Z]{3,})"), ' JJJallcaps ')