Skip to content

Commit c33d402

Browse files
committed
1. Added a safe-sequences approach that hides complex links and e-mail addresses so that they are not typographed.
2. Fixed the following issues: issue #41 (link problem); issue #42 (same as #41); issue #45 (quote problem); issue #59 (special case in auto_comma).
1 parent 4beac72 commit c33d402

13 files changed

+329
-9
lines changed

src-php/EMT.Lib.php

+26
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,32 @@ public static function split_number($num) {
693693
return number_format($num, 0, '', ' ');
694694
}
695695

696+
/**
 * Returns the PCRE pattern used to locate URLs inside free text.
 *
 * The active pattern is the "@gruber v2 (218 chars)" entry listed at
 * https://mathiasbynens.be/demo/url-regex. It is delimited with "#" and
 * carries the "i" and "S" modifiers.
 *
 * @return string complete pattern, ready to be passed to the preg_* functions
 */
public static function url_regex() {
    // The heredoc body and its closing marker stay at column 0 so that no
    // leading whitespace becomes part of the pattern.
    $pattern = <<<URLREGEX
#(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))#iS
URLREGEX;

    /*
     * Disabled alternative patterns, kept for reference only:
     *
     * _(?:(?:https?|ftp)://)(?:\S+(?::\S*)?@)?(?:(?!10(?:\.\d{1,3}){3})(?!127(?:\.\d{1,3}){3})(?!169\.254(?:\.\d{1,3}){2})(?!192\.168(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\x{00a1}-\x{ffff}0-9]+-?)*[a-z\x{00a1}-\x{ffff}0-9]+)(?:\.(?:[a-z\x{00a1}-\x{ffff}0-9]+-?)*[a-z\x{00a1}-\x{ffff}0-9]+)*(?:\.(?:[a-z\x{00a1}-\x{ffff}]{2,})))(?::\d{2,5})?(?:/[^\s]*)?_iuS
     *
     * /([a-z][a-z0-9\*\-\.]*):\/\/(?:(?:(?:[\w\.\-\+!$&'\(\)*\+,;=]|%[0-9a-f]{2})+:)*(?:[\w\.\-\+%!$&'\(\)*\+,;=]|%[0-9a-f]{2})+@)?(?:(?:[a-z0-9\-\.]|%[0-9a-f]{2})+|(?:\[(?:[0-9a-f]{0,4}:)*(?:[0-9a-f]{0,4})\]))(?::[0-9]+)?(?:[\/|\?](?:[\w#!:\.\?\+=&@!$'~*,;\/\(\)\[\]\-]|%[0-9a-f]{2})*)?/xiS
     */

    return $pattern;
}
713+
714+
/**
 * Returns the PCRE pattern used to locate e-mail addresses inside free text.
 *
 * The expression comes from https://emailregex.com/. Every "~" inside it is
 * escaped, then it is wrapped in "~" delimiters and given the "imS" modifiers.
 *
 * @return string complete pattern, ready to be passed to the preg_* functions
 */
public static function email_regex() {
    // A nowdoc (quoted marker, PHP >= 5.3) is required here. In a plain heredoc
    // PHP itself expands "\x5d" to a raw "]" and "\\[" to "\[" before PCRE sees
    // them, which closes the quoted-local-part character class early and turns
    // the quoted-pair class into a literal "[". With a nowdoc the escapes reach
    // PCRE untouched. The body and closing marker stay at column 0 so that no
    // whitespace leaks into the pattern.
    $expression = <<<'EMAILREGEX'
(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])
EMAILREGEX;

    // "~" occurs inside the expression, so escape it before using it as delimiter.
    return '~' . str_replace('~', '\\~', $expression) . '~imS';
}
696722
}
697723

698724
?>

src-php/EMT.Tret.Punctmark.php

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ class EMT_Tret_Punctmark extends EMT_Tret
1111
public $rules = array(
1212
'auto_comma' => array(
1313
'description' => 'Расстановка запятых перед а, но',
14-
'pattern' => '/([a-zа-яё])(\s|&nbsp;)(но|а)(\s|&nbsp;)/iu',
14+
'pattern' => '/([a-zа-яё])(\s|&nbsp;)(но|а)(\s|&nbsp;)/u',
1515
'replacement' => '\1,\2\3\4'
1616
),
1717
'punctuation_marks_limit' => array(

src-php/EMT.Tret.Quote.php

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class EMT_Tret_Quote extends EMT_Tret
2424

2525
'open_quote' => array(
2626
'description' => 'Открывающая кавычка',
27-
'pattern' => '/(^|\(|\s|\>|-)((\"|\\\")+)(\S+)/iue',
27+
'pattern' => '/(^|\(|\s|\>|-)((\"|\\\")+)(\S+?)/iue',
2828
'replacement' => '$m[1] . str_repeat(self::QUOTE_FIRS_OPEN, substr_count($m[2],"\"") ) . $m[4]'
2929
),
3030
'close_quote' => array(

src-php/EMT.Tret.Text.php

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class EMT_Tret_Text extends EMT_Tret
2424
),
2525
'email' => array(
2626
'description' => 'Выделение эл. почты из текста',
27-
'pattern' => '/(\s|^|\&nbsp\;|\()([a-z0-9\-\_\.]{2,})\@([a-z0-9\-\.]{2,})\.([a-z]{2,6})(\)|\s|\.|\,|\!|\?|$|\<)/e',
27+
'pattern' => '/(\s|^|\&nbsp\;|\()([a-z0-9\-\_\.]{2,})\@([a-z0-9\-\.]{2,})\.([a-z]{2,6})(\)|\s|\.|\,|\!|\?|$|\<)/ie',
2828
'replacement' => '$m[1] . $this->tag($m[2]."@".$m[3].".".$m[4], "a", array("href" => "mailto:".$m[2]."@".$m[3].".".$m[4])) . $m[5]'
2929
),
3030
'no_repeat_words' => array(

src-php/EMT.php

+129-2
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,9 @@ protected function debug($class, $place, &$after_text, $after_text_raw = "")
8080

8181

8282

83-
protected $_safe_blocks = array();
83+
protected $_safe_blocks = array();
84+
protected $_safe_sequences = array();
85+
protected $_safe_sequence_mark = "SAFESEQUENCENUM";
8486

8587

8688
/**
@@ -127,6 +129,38 @@ private function _add_safe_block($id, $open, $close, $tag)
127129
);
128130
}
129131

132+
/**
 * Registers one protected sequence.
 *
 * The entry is appended to $this->_safe_sequences.
 *
 * @param int    $type    kind of sequence:
 *                        0 - URL
 *                        1 - e-mail
 * @param string $content real (original) content of the sequence
 * @return void
 */
private function _add_safe_sequence($type, $content)
{
    $entry = array();
    $entry['type'] = $type;
    $entry['content'] = $content;

    $this->_safe_sequences[] = $entry;
}
148+
149+
/**
 * Picks a placeholder mark that does not occur in the text being processed.
 *
 * The current value of $this->_safe_sequence_mark is tried first. While the
 * candidate is found in $this->_text, the next candidate "SAFESEQUENCE0NUM",
 * "SAFESEQUENCE1NUM", ... is tried. The first candidate that is not found is
 * stored back into $this->_safe_sequence_mark.
 *
 * @return void
 */
protected function detect_safe_mark() {
    $text = (string) $this->_text;
    $candidate = $this->_safe_sequence_mark;

    // Candidates are built from fixed literals rather than by str_replace() on
    // the current mark: once the mark is no longer exactly "SAFESEQUENCENUM"
    // such a replacement is a no-op and the loop would never terminate.
    for ($i = 0; strpos($text, $candidate) !== false; $i++) {
        $candidate = "SAFESEQUENCE" . $i . "NUM";
    }

    $this->_safe_sequence_mark = $candidate;
}
163+
130164
/**
131165
* Список защищенных блоков
132166
*
@@ -137,6 +171,16 @@ public function get_all_safe_blocks()
137171
return $this->_safe_blocks;
138172
}
139173

174+
/**
 * Lists the protected sequences registered so far.
 *
 * @return array the current contents of $this->_safe_sequences
 */
public function get_all_safe_sequences()
{
    $sequences = $this->_safe_sequences;

    return $sequences;
}
183+
140184
/**
141185
* Удаленного блока по его номеру ключа
142186
*
@@ -150,7 +194,6 @@ public function remove_safe_block($id)
150194
}
151195
}
152196

153-
154197
/**
155198
* Добавление защищенного блока
156199
*
@@ -217,6 +260,81 @@ public function safe_blocks($text, $way, $show = true)
217260
return $text;
218261
}
219262

263+
/**
 * preg_replace_callback() handler that hides a matched URL.
 *
 * Stores the matched text as a type-0 protected sequence and returns a
 * URL-shaped placeholder that carries the id of the stored entry.
 *
 * @param array $m regex match; $m[0] is the full matched URL
 * @return string placeholder of the form "http://mdash.ru/A0<mark><id>ID"
 */
public function safe_sequence_url($m) {
    $this->_add_safe_sequence(0, $m[0]);

    // The id is the number of entries that existed before this one was added.
    $index = count($this->_safe_sequences) - 1;

    $placeholder = "http://mdash.ru/A0" . $this->_safe_sequence_mark;
    $placeholder .= $index . "ID";

    return $placeholder;
}
274+
275+
/**
 * preg_replace_callback() handler that hides a matched e-mail address.
 *
 * Stores the matched text as a type-1 protected sequence and returns an
 * e-mail-shaped placeholder that carries the id of the stored entry.
 *
 * @param array $m regex match; $m[0] is the full matched address
 * @return string placeholder of the form "A1<mark><id>ID@mdash.ru"
 */
function safe_sequence_email($m) {
    $id = count($this->_safe_sequences);
    $this->_add_safe_sequence(1, $m[0]);
    // The "ID@mdash.ru" tail must match the e-mail decode pattern built in
    // safe_sequences(), otherwise the placeholder is never restored.
    return "A1" . $this->_safe_sequence_mark . $id . "ID@mdash.ru";
}

/**
 * Looks up the stored content for a placeholder match.
 *
 * @param array $m regex match; $m[1] is the id captured from the placeholder
 * @return string|null stored content, or null when no entry has that id
 */
private function _safe_sequence_content($m) {
    if (!isset($this->_safe_sequences[$m[1]]['content'])) {
        return null;
    }
    return $this->_safe_sequences[$m[1]]['content'];
}

/**
 * preg_replace_callback() handler that restores a hidden URL.
 *
 * @param array $m regex match; $m[1] is the sequence id
 * @return string the stored URL, or the placeholder itself when the id is unknown
 */
function unsafe_sequence_url($m) {
    $content = $this->_safe_sequence_content($m);
    return $content === null ? $m[0] : $content;
}

/**
 * preg_replace_callback() handler that restores a hidden URL without its
 * leading "scheme://" part.
 *
 * @param array $m regex match; $m[1] is the sequence id
 * @return string the stored URL minus its scheme, or the placeholder itself
 *                when the id is unknown
 */
function unsafe_sequence_url_nohttp($m) {
    $content = $this->_safe_sequence_content($m);
    if ($content === null) {
        return $m[0];
    }
    // Anchored and limited to one replacement: only the scheme at the very
    // start is dropped. An unanchored "[^:]+://" would also swallow everything
    // up to a later "://" (for example one inside a query string).
    return preg_replace('~^[a-z][\w+.\-]*://~i', '', $content, 1);
}

/**
 * preg_replace_callback() handler that restores a hidden e-mail address.
 *
 * @param array $m regex match; $m[1] is the sequence id
 * @return string the stored address, or the placeholder itself when the id is unknown
 */
function unsafe_sequence_email($m) {
    $content = $this->_safe_sequence_content($m);
    return $content === null ? $m[0] : $content;
}

/**
 * Hides URLs and e-mail addresses behind placeholders, or restores them.
 *
 * @param string $text text to process
 * @param bool   $way  exactly true - replace URLs, then e-mail addresses, with
 *                     placeholders; any other value - replace placeholders
 *                     with the stored content
 * @param bool   $show not used by this method
 * @return string processed text
 */
public function safe_sequences($text, $way, $show = true)
{
    if (true === $way) {
        $steps = array(
            array(EMT_Lib::url_regex(), "safe_sequence_url"),
            array(EMT_Lib::email_regex(), "safe_sequence_email"),
        );
    } else {
        $mark = preg_quote($this->_safe_sequence_mark, '~');
        // Order matters: full URL placeholders first, then the ones whose
        // "http://" prefix is gone, then e-mail placeholders.
        $steps = array(
            array('~http://mdash\.ru/A0' . $mark . '(\d+)ID~ims', "unsafe_sequence_url"),
            array('~mdash\.ru/A0' . $mark . '(\d+)ID~ims', "unsafe_sequence_url_nohttp"),
            array('~A1' . $mark . '(\d+)ID@mdash\.ru~ims', "unsafe_sequence_email"),
        );
    }

    foreach ($steps as $step) {
        $replaced = preg_replace_callback($step[0], array($this, $step[1]), $text);
        // preg_replace_callback() returns null on a PCRE failure; keep the
        // text from before this step instead of discarding it.
        if ($replaced !== null) {
            $text = $replaced;
        }
    }

    return $text;
}
220338

221339
/**
222340
* Декодирование блоков, которые были скрыты в момент типографирования
@@ -287,6 +405,8 @@ private function _init()
287405
$this->add_safe_block('span-notg', '<span class="_notg_start"></span>', '<span class="_notg_end"></span>');
288406
}
289407
$this->inited = true;
408+
409+
$this->detect_safe_mark();
290410
}
291411

292412

@@ -394,6 +514,9 @@ public function apply($trets = null)
394514

395515
$this->debug($this, 'init', $this->_text);
396516

517+
$this->_text = $this->safe_sequences($this->_text, true);
518+
$this->debug($this, 'safe_sequences', $this->_text);
519+
397520
$this->_text = $this->safe_blocks($this->_text, true);
398521
$this->debug($this, 'safe_blocks', $this->_text);
399522

@@ -403,6 +526,7 @@ public function apply($trets = null)
403526
$this->_text = EMT_Lib::clear_special_chars($this->_text);
404527
$this->debug($this, 'clear_special_chars', $this->_text);
405528

529+
406530
foreach ($atrets as $tret)
407531
{
408532
// если установлен режим разметки тэгов то выставим его
@@ -460,6 +584,9 @@ public function apply($trets = null)
460584
$this->_text = $this->safe_blocks($this->_text, false);
461585
$this->debug($this, 'unsafe_blocks', $this->_text);
462586

587+
$this->_text = $this->safe_sequences($this->_text, false);
588+
$this->debug($this, 'unsafe_sequences', $this->_text);
589+
463590
if(!$this->disable_notg_replace)
464591
{
465592
$repl = array('<span class="_notg_start"></span>', '<span class="_notg_end"></span>');

0 commit comments

Comments
 (0)