-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcore.js
208 lines (165 loc) · 7.77 KB
/
core.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
// Typographically correct quotation marks, keyed by language code.
// Every entry carries a primary pair (correct_open / correct_close) and a
// secondary pair (secondary_open / secondary_close) for quotes nested inside quotes.
// The values are plain replacement strings handed to String.replace(); they are not regex.
// The patterns that locate wrong marks ("context" and "specific" matches) are compiled to regex elsewhere.
const languages = {
    EN: {
        correct_open: "“",
        correct_close: "”",
        secondary_open: "‘",
        secondary_close: "’",
    },
    FR: {
        // guillemets carry a no-break space on the inside
        correct_open: "«\u00A0",
        correct_close: "\u00A0»",
        secondary_open: "“",
        secondary_close: "”",
    },
    IT: {
        correct_open: "«",
        correct_close: "»",
        secondary_open: "“",
        secondary_close: "”",
    },
    ES: {
        correct_open: "«",
        correct_close: "»",
        secondary_open: "“",
        secondary_close: "”",
    },
    DE: {
        correct_open: "„",
        // closing marks reuse the "left"/opening curly glyphs; per the original note,
        // this is done so that the font matches
        correct_close: "“",
        secondary_open: "‚", // low single quote, NOT an ASCII comma
        secondary_close: "‘",
    },
    LT: {
        // same marks as the DE entry
        correct_open: "„",
        correct_close: "“",
        secondary_open: "‚", // low single quote, NOT an ASCII comma
        secondary_close: "‘",
    },
    PL: {
        correct_open: "„",
        correct_close: "”",
        secondary_open: "«",
        secondary_close: "»",
    },
}
// Catalogue of every quotation-mark variant that may appear in the input.
// Each entry is a regex *source fragment*: it is spliced into the position
// templates to build the error-finding patterns, which is why the spaced
// guillemets spell whitespace as "\\s" (a backslash that must survive into the regex).
const all_marks = [
    // straight marks
    '"',      // double
    "'",      // single

    // curly ("smart") marks
    "“",      // double, opening
    "”",      // double, closing
    "‘",      // single, opening
    "’",      // single, closing

    // marks labelled "German" in the original list
    "„",      // double, opening
    "‟",      // double, closing  (NB: this is not the left double quotation mark)
    "‚",      // single, opening  (NB: this is not a comma)
    "‛",      // single, closing  (NB: this is not the left single quotation mark)

    // double guillemets, bare and with adjoining whitespace
    "«",
    "»",
    "«\\s",
    "\\s»",

    // single guillemets, bare and with adjoining whitespace
    "‹",
    "›",
    "‹\\s",
    "\\s›",
]
// Contexts in which a quotation mark counts as an opening or a closing mark.
// "{sign}" is a placeholder that is substituted with each candidate mark when the
// regex set is generated; the resulting strings are compiled with the "i" flag,
// so the lowercase ranges below also cover uppercase letters.
//
// What counts as a "letter": ASCII a-z plus Latin letters with diacritics
// (U+00C0-U+024F, minus the × and ÷ signs). A plain [a-z] treats accented letters as
// non-letters, so an apostrophe such as the one in "l'été" looks like a closing
// quote and quotes around words like "école" are not recognised at all.
const letter_class = "a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u024F"
const opening_error_positions = [
    "[^" + letter_class + "]{sign}[" + letter_class + "]", // Mark before a word begins; avoid apostrophes
    "{sign}[¡¿][" + letter_class + "]" // Spanish cases (inverted ? and ! after the mark)
]
const closing_error_positions = [
    "[" + letter_class + "]{sign}[^" + letter_class + "]", // Mark after a word ends; avoid apostrophes
    "[" + letter_class + "][,.?!]{sign}" // Mark where quote ends with punctuation
]
// Remove the first occurrence of `element` from `array`, in place.
//   array   - the array to modify
//   element - the value to remove (compared with strict equality, as indexOf does)
// Returns nothing. If `element` is not present the array is left untouched:
// indexOf() reports -1 for "not found", and splice(-1,1) would otherwise
// silently delete the LAST element instead.
const remove = (array,element) => {
    const index = array.indexOf(element)
    if (index !== -1) {
        array.splice(index,1)
    }
}
// Build the lookup of error patterns for one side (opening or closing) of a quotation.
//   position_set - array of position templates, each containing a "{sign}" placeholder
//   correct_char - the mark that is already correct for the chosen language; it is
//                  left out so that correct text is not reported as an error
// Returns an object whose keys are regex source strings (every remaining mark
// substituted into every position) and whose values are the mark that key targets,
// i.e. the part of the match that has to be swapped out.
//
// The candidate list is derived with filter() rather than by deleting from a copy:
// if correct_char is not one of all_marks (e.g. a mark that includes a literal
// no-break space), nothing is excluded. Removing "index -1" from a copy would
// instead drop the last mark in the list.
const generateRegex = (position_set,correct_char) => {
    const errors = {}
    const marks = all_marks.filter(mark => mark !== correct_char)
    for (const mark of marks) {
        for (const position of position_set) {
            // function replacer: insert the mark verbatim, without "$" pattern expansion
            const regex = position.replace("{sign}",() => mark)
            errors[regex] = mark
        }
    }
    return errors
}
// Find every section of `text` matching one of the context patterns, and inside each
// matched section replace only the offending sign.
//   text         - the string to fix
//   regex_obj    - object mapping a context pattern (regex source) to the "mark"
//                  pattern (also regex source) that must be replaced within each match
//   correct_char - literal replacement text for the mark (a plain string, not regex;
//                  it is inserted verbatim, without "$" pattern expansion)
// Returns the fixed string. Both patterns are compiled with the "gim" flags.
const findReplace = (text,regex_obj,correct_char) => {
    for (const [regex,mark] of Object.entries(regex_obj)) {
        const context_re = new RegExp(regex,"gim")
        const mark_re = new RegExp(mark,"gim") // convert "mark" to regex too
        // declared locally: assigning to an undeclared `match` leaks a global and
        // throws a ReferenceError in strict mode / ES modules
        let match
        while ((match = context_re.exec(text)) !== null) {
            const found = match[0]
            if (found.length === 0) {
                // a zero-width match never advances lastIndex on its own
                context_re.lastIndex = match.index + 1
                continue
            }
            const fixed = found.replace(mark_re,() => correct_char)
            // splice at the matched position; replacing by content could rewrite an
            // earlier identical substring instead of the one that was matched
            text = text.slice(0,match.index) + fixed + text.slice(match.index + found.length)
            // the replacement may be longer or shorter than the original, so resume
            // right after the rewritten section instead of at the stale offset
            context_re.lastIndex = match.index + fixed.length
        }
    }
    return text
}
// Nested (secondary) quotes are handled in two steps: first *all* quotes are turned
// into the correct primary open/close marks, then we look for the same primary mark
// occurring twice with no matching counterpart in between.
//
// This builds the pattern for that second step:
//   repeated_char - the mark that appears twice
//   break_char    - the counterpart; its characters may not occur between the two marks
// Returns a single-entry object: key = regex source, value = repeated_char.
const generateSecondaryRegex = (repeated_char,break_char) => {
    // repeated mark, one or more characters that are not break characters, repeated mark
    const pattern = `${repeated_char}[^${break_char}]+${repeated_char}`
    return { [pattern]: repeated_char }
}
// Turn the inner mark(s) of a "repeated primary mark" section into secondary marks.
//   text         - the string to fix (primary marks are expected to be in place already)
//   regex_obj    - single-entry object: key = pattern (regex source) for the repeated
//                  section, value = pattern for the mark to replace inside it;
//                  only the first entry is used
//   correct_char - literal secondary mark to insert (inserted verbatim)
//   open_close   - "open": keep the first character of the match and rewrite the rest;
//                  "close": keep the last character of the match and rewrite the rest
// Returns the fixed string. Throws TypeError if open_close is any other value.
const replaceSecondaryQuotes = (text,regex_obj,correct_char,open_close) => {
    if (open_close !== "open" && open_close !== "close") {
        throw new TypeError('open_close must be "open" or "close", got ' + String(open_close))
    }
    const [pattern] = Object.keys(regex_obj)
    const [mark] = Object.values(regex_obj)
    if (pattern === undefined) {
        // nothing to search for; an undefined pattern would compile to an empty regex
        return text
    }
    const section_re = new RegExp(pattern,"gim")
    const mark_re = new RegExp(mark,"gim")
    // declared locally: an undeclared `match` leaks a global and throws in strict mode
    let match
    while ((match = section_re.exec(text)) !== null) {
        const found = match[0]
        if (found.length === 0) {
            // a zero-width match never advances lastIndex on its own
            section_re.lastIndex = match.index + 1
            continue
        }
        // One character is protected from editing: the outer mark that must stay primary.
        // Cutting a single character is enough even for two-character marks, because
        // the remainder of a truncated mark can no longer match the mark pattern.
        const head = open_close === "open" ? found.slice(0,1) : ""   // don't edit the first one
        const tail = open_close === "close" ? found.slice(-1) : ""   // don't edit the second one
        const editable = found.slice(head.length,found.length - tail.length)
        const fixed = head + editable.replace(mark_re,() => correct_char) + tail
        // splice at the matched position and resume right after the rewritten section,
        // so a length change cannot shift the scan past (or before) the next match
        text = text.slice(0,match.index) + fixed + text.slice(match.index + found.length)
        section_re.lastIndex = match.index + fixed.length
    }
    return text
}
// Fix assorted punctuation by feeding pattern objects to findReplace().
//   text - the string to fix
// Returns the fixed string. Applied in this order:
//   1. straight apostrophe between two letters  -> ’
//   2. hyphen with whitespace on both sides     -> en-dash (whitespace kept)
//   3. em-dash                                  -> en-dash with a space on each side
//   4. three consecutive full stops             -> …
//   5. (…)                                      -> […]
// Each object maps a context pattern to the pattern of the part to replace; both are
// regex source (hence the doubled backslashes), while the replacement is a plain string.
const fixPunctuation = (text) => {
    // "Letter" = ASCII a-z plus Latin letters with diacritics (U+00C0-U+024F, minus × and ÷);
    // with [a-z] alone, apostrophes next to accented letters ("d'être") are not fixed.
    // findReplace compiles patterns case-insensitively, so uppercase is covered too.
    const letter = "[a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u024F]"
    const regex_obj_apostrophe = {[letter + "'" + letter]: "'"} // regex to catch the desired replacement chars in situ
    const regex_obj_dash = {"\\s-\\s": "-"} // also define specific replacement characters as regex
    const regex_obj_emdash = {"—": "—"} // because they get converted in findReplace()
    const regex_obj_ellipsis = {"\\.{3}": "\\.{3}"} // remember to escape the escape characters!
    const regex_obj_ellipsis_brackets = {"\\(…\\)": "\\(…\\)"}
    text = findReplace(text,regex_obj_apostrophe,"’") // define replacement chars as string, not as regex
    text = findReplace(text,regex_obj_dash,"–") // replace spaced dashes with spaced en-dash
    text = findReplace(text,regex_obj_emdash," – ") // replace em-dash with spaced en-dash
    text = findReplace(text,regex_obj_ellipsis,"…")
    text = findReplace(text,regex_obj_ellipsis_brackets,"[…]") // runs after step 4, so "(...)" is covered as well
    return text
}
// Master function: apply every fix to a source text for one language.
//   source   - the text to fix
//   language - a key of `languages` (e.g. "EN", "FR")
// Returns the fixed text. Throws TypeError when `language` is not a supported key.
// Steps, in order:
//   1. replace wrong opening marks, then wrong closing marks, with the primary pair
//   2. where a primary mark repeats without its counterpart in between, turn the
//      inner mark into the secondary open / close mark
//   3. fix apostrophes, dashes and ellipses
const fixAll = (source,language) => {
    if (!Object.prototype.hasOwnProperty.call(languages,language)) {
        // fail with a readable message instead of "cannot read properties of undefined"
        throw new TypeError("Unsupported language: " + String(language))
    }
    const {correct_open,correct_close,secondary_open,secondary_close} = languages[language]

    const open_quote_errors = generateRegex(opening_error_positions,correct_open)
    const close_quote_errors = generateRegex(closing_error_positions,correct_close)
    const secondary_open_regex = generateSecondaryRegex(correct_open,correct_close)
    const secondary_close_regex = generateSecondaryRegex(correct_close,correct_open)

    // declared locally: assigning to an undeclared `fixed_text` leaks a global and
    // throws a ReferenceError in strict mode / ES modules
    let fixed_text = findReplace(source,open_quote_errors,correct_open)
    fixed_text = findReplace(fixed_text,close_quote_errors,correct_close)
    fixed_text = replaceSecondaryQuotes(fixed_text,secondary_open_regex,secondary_open,"open")
    fixed_text = replaceSecondaryQuotes(fixed_text,secondary_close_regex,secondary_close,"close")
    fixed_text = fixPunctuation(fixed_text)
    return fixed_text
}
// CommonJS export: fixAll is the only function exposed on module.exports here
module.exports.fixAll = fixAll