Skip to content

Commit bb15057

Browse files
committed
#13 Implement IMAGE_LINK parsing.
1 parent 1ff9058 commit bb15057

File tree

8 files changed

+196
-31
lines changed

8 files changed

+196
-31
lines changed

CFG.txt

+5-6
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ PARAGRAPH -> RUN "\n" PARAGRAPH #
6767
/ empty # (i.e. RUN \n RUN \n)
6868

6969
RUN -> INLINE+
70-
INLINE -> EXPANDABLE / TAG / FILE_LINK / WIKI_LINK / EXTERNAL_LINK / FORMAT / PLAIN_TEXT
70+
INLINE -> EXPANDABLE / TAG / IMAGE_LINK / WIKI_LINK / EXTERNAL_LINK / FORMAT / PLAIN_TEXT
7171
EXPANDABLE -> COMMENT / TEMPLATE / ARGUMENT_REF # COMMENT expands to ""
7272

7373
EXPANDABLE_TEXT -> EXPANDABLE EXPANDABLE_TEXT
@@ -84,12 +84,11 @@ EXPANDABLE_URL -> EXPANDABLE EXPANDABLE_URL
8484
# Syntax for inserting an image or thumbnail
8585
# c.f. https://www.mediawiki.org/wiki/Help:Images#Rendering_a_single_image
8686
IMAGE_LINK -> "[[" IMAGE_LINK_TARGET IMAGE_LINK_ARGUMENT* "]]"
87-
>| "]]" # IMAGE_LINK_OPTION and IMAGE_LINK_CAPTION can have \n inside.
88-
IMAGE_LINK_TARGET -> "[\s_]*(File|Image|<...>)[\s_]*:" EXPANDABLE_TEXT
89-
+| "|" # <...> is customizable File namespace aliases.
87+
>| "]]|\|"
88+
IMAGE_LINK_TARGET -> "[\s_]*(File|Image|<...>)[\s_]*:" EXPANDABLE_TEXT # <...> is customizable File namespace aliases.
89+
# IMAGE_LINK_ARGUMENT can have \n inside.
90+
# IMAGE_LINK_CAPTION is actually the last IMAGE_LINK_ARGUMENT
9091
IMAGE_LINK_ARGUMENT -> "|" WIKITEXT "=" WIKITEXT
91-
/ "|" WIKITEXT # IMAGE_LINK_CAPTION is actually the last IMAGE_LINK_ARGUMENT
92-
+| "|" / "]]"
9392

9493
# Known issue: Current implementation will parse [[http://abc]] as WIKI_LINK,
9594
# while actually it should be treated as "[" EXTERNAL_LINK "]"

MwParserFromScratch/Nodes/Inline.cs

+23-3
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ internal override void ToPlainTextCore(StringBuilder builder, NodePlainTextForma
5656
}
5757
}
5858

59+
/// <summary>
60+
/// <c>[[Target|text]]</c>
61+
/// </summary>
5962
public class WikiLink : InlineNode
6063
{
6164
private Run _Target;
@@ -88,9 +91,7 @@ protected override Node CloneCore()
8891
}
8992

9093
public override string ToString() => Text == null ? $"[[{Target}]]" : $"[[{Target}|{Text}]]";
91-
92-
private static readonly Regex PipeTrickTitleMatcher = new Regex(@".+?(?=\w*\()");
93-
94+
9495
/// <param name="builder"></param>
9596
/// <param name="formatter"></param>
9697
/// <inheritdoc />
@@ -171,6 +172,25 @@ protected override Node CloneCore()
171172
return new WikiImageLink(Target) { Arguments = { Arguments } };
172173
}
173174

175+
/// <inheritdoc />
176+
public override string ToString()
177+
{
178+
var sb = new StringBuilder("[[", 16 + Arguments.Count * 4);
179+
sb.Append(_Target);
180+
foreach (var arg in Arguments)
181+
{
182+
sb.Append('|');
183+
if (arg.Name != null)
184+
{
185+
sb.Append(arg.Name);
186+
sb.Append('=');
187+
}
188+
sb.Append(arg.Value);
189+
}
190+
sb.Append("]]");
191+
return sb.ToString();
192+
}
193+
174194
/// <inheritdoc />
175195
internal override void ToPlainTextCore(StringBuilder builder, NodePlainTextFormatter formatter)
176196
{

MwParserFromScratch/ParserCore.Basic.cs

+62-2
Original file line numberDiff line numberDiff line change
@@ -393,17 +393,71 @@ private bool ParseRun(RunParsingMode mode, IInlineContainer container, bool setL
393393
private InlineNode ParseInline()
394394
{
395395
return ParseTag()
396+
?? ParseImageLink()
396397
?? ParseWikiLink()
397398
?? ParseExternalLink()
398399
?? ParseFormatSwitch()
399-
?? (InlineNode)ParsePartialPlainText();
400+
?? (InlineNode) ParsePartialPlainText();
400401
}
401402

402403
private InlineNode ParseExpandable()
403404
{
404405
return ParseComment() ?? ParseBraces();
405406
}
406407

408+
private WikiImageLink ParseImageLink()
409+
{
410+
if (LookAheadToken(@"\[\[") == null) return null;
411+
// Check namespace prefix.
412+
if (LookAheadToken(@"\[\[[\s_]*(?i:" + options.ImageNamespaceRegexp + @")[\s_]*:") == null) return null;
413+
ParseStart(@"\||\]\]", true);
414+
if (ConsumeToken(@"\[\[") == null) return ParseFailed<WikiImageLink>();
415+
// IMAGE_LINK_TARGET
416+
var target = new Run();
417+
// No nested link expression or line break inside IMAGE_LINK_TARGET
418+
ParseStart(@"\[\[|\n", false);
419+
if (!ParseRun(RunParsingMode.ExpandableText, target, true))
420+
{
421+
Fallback();
422+
return ParseFailed<WikiImageLink>();
423+
}
424+
Accept();
425+
var node = new WikiImageLink(target);
426+
// IMAGE_LINK_ARGUMENT
427+
while (ConsumeToken(@"\|") != null)
428+
{
429+
var arg = ParseImageLinkArgument();
430+
node.Arguments.Add(arg);
431+
}
432+
if (ConsumeToken(@"\]\]") == null)
433+
{
434+
if (options.AllowClosingMarkInference)
435+
node.SetInferredClosingMark();
436+
else
437+
return ParseFailed<WikiImageLink>();
438+
}
439+
return ParseSuccessful(node);
440+
}
441+
442+
/// <summary>
443+
/// IMAGE_LINK_ARGUMENT
444+
/// </summary>
445+
private WikiImageLinkArgument ParseImageLinkArgument()
446+
{
447+
ParseStart(@"=", false);
448+
var a = ParseWikitext();
449+
Debug.Assert(a != null);
450+
if (ConsumeToken(@"=") != null)
451+
{
452+
// name=value
453+
CurrentContext.Terminator = null;
454+
var value = ParseWikitext();
455+
Debug.Assert(value != null);
456+
return ParseSuccessful(new WikiImageLinkArgument(a, value));
457+
}
458+
return ParseSuccessful(new WikiImageLinkArgument(null, a));
459+
}
460+
407461
private WikiLink ParseWikiLink()
408462
{
409463
// Note that wikilink cannot nest itself.
@@ -428,7 +482,13 @@ private WikiLink ParseWikiLink()
428482
if (ParseRun(RunParsingMode.ExpandableText, text, true))
429483
node.Text = text;
430484
}
431-
if (ConsumeToken(@"\]\]") == null) return ParseFailed<WikiLink>();
485+
if (ConsumeToken(@"\]\]") == null)
486+
{
487+
if (options.AllowClosingMarkInference)
488+
node.SetInferredClosingMark();
489+
else
490+
return ParseFailed<WikiLink>();
491+
}
432492
return ParseSuccessful(node);
433493
}
434494

MwParserFromScratch/ParserCore.cs

+2-5
Original file line numberDiff line numberDiff line change
@@ -135,11 +135,9 @@ private T ParseSuccessful<T>(T value, bool setLineNumber = true) where T : Node
135135
/// Accept the characters consumed in the current context.
136136
/// You won't be able to backtrack to the position where the last <see cref="ParseStart()"/> call took place.
137137
/// </summary>
138-
/// <returns></returns>
139-
private bool Accept()
138+
private void Accept()
140139
{
141140
contextStack.Pop();
142-
return true;
143141
}
144142

145143
private T ParseFailed<T>(T node = default(T)) where T : Node
@@ -148,15 +146,14 @@ private bool Accept()
148146
return default(T);
149147
}
150148

151-
private bool Fallback()
149+
private void Fallback()
152150
{
153151
logger?.NotifyFallback(position, contextStack.Count);
154152
var context = contextStack.Pop();
155153
// Fallback
156154
position = context.StartingPosition;
157155
lineNumber = context.StartingLineNumber;
158156
linePosition = context.StartingLinePosition;
159-
return false;
160157
}
161158

162159
private bool BeginningOfLine()

MwParserFromScratch/WikitextParserOptions.cs

+75-9
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
using System.Collections.ObjectModel;
44
using System.Linq;
55
using System.Text;
6+
using System.Text.RegularExpressions;
67
using System.Threading.Tasks;
78
using MwParserFromScratch.Nodes;
89

@@ -32,6 +33,11 @@ public class WikitextParserOptions
3233
"br", "wbr", "hr", "meta", "link"
3334
});
3435

36+
public static readonly IReadOnlyList<string> DefaultImageNamespaceNames = new ReadOnlyCollection<string>(new[]
37+
{
38+
"File", "Image"
39+
});
40+
3541
internal static readonly HashSet<string> DefaultCaseInsensitiveMagicTemplatesSet =
3642
new HashSet<string>(StringComparer.OrdinalIgnoreCase)
3743
{
@@ -198,6 +204,7 @@ public class WikitextParserOptions
198204
"DEFAULTCATEGORYSORT",
199205
"PAGESINNS"
200206
};
207+
201208
#endregion
202209

203210
public static readonly IReadOnlyList<MagicTemplateNameInfo> DefaultMagicTemplateNames
@@ -207,12 +214,20 @@ public static readonly IReadOnlyList<MagicTemplateNameInfo> DefaultMagicTemplate
207214
.ToArray());
208215

209216
private static readonly HashSet<string> DefaultParserTagsSet = new HashSet<string>(DefaultParserTags, StringComparer.OrdinalIgnoreCase);
210-
private static readonly HashSet<string> DefaultSelfClosingOnlyTagsSet = new HashSet<string>(DefaultSelfClosingOnlyTags, StringComparer.OrdinalIgnoreCase);
217+
218+
private static readonly HashSet<string> DefaultSelfClosingOnlyTagsSet =
219+
new HashSet<string>(DefaultSelfClosingOnlyTags, StringComparer.OrdinalIgnoreCase);
220+
221+
private static readonly HashSet<string> DefaultImageNamespaceNamesSet =
222+
new HashSet<string>(DefaultImageNamespaceNames, StringComparer.OrdinalIgnoreCase);
223+
224+
private static readonly string DefaultImageNamespaceNameRegexp = string.Join("|", DefaultImageNamespaceNames.Select(Regex.Escape));
211225

212226
internal static WikitextParserOptions DefaultOptionsCopy = new WikitextParserOptions().DefensiveCopy();
213227

214228
private IEnumerable<string> _ParserTags;
215229
private IEnumerable<string> _SelfClosingOnlyTags;
230+
private IEnumerable<string> _ImageNamespaceNames;
216231
private IEnumerable<MagicTemplateNameInfo> _MagicTemplateNames;
217232
private bool _AllowEmptyTemplateName;
218233
private bool _AllowEmptyWikiLinkTarget;
@@ -276,7 +291,11 @@ public IEnumerable<MagicTemplateNameInfo> MagicTemplateNames
276291
public bool AllowEmptyTemplateName
277292
{
278293
get { return _AllowEmptyTemplateName; }
279-
set { _AllowEmptyTemplateName = value; _DefensiveCopy = null; }
294+
set
295+
{
296+
_AllowEmptyTemplateName = value;
297+
_DefensiveCopy = null;
298+
}
280299
}
281300

282301
/// <summary>
@@ -286,7 +305,11 @@ public bool AllowEmptyTemplateName
286305
public bool AllowEmptyWikiLinkTarget
287306
{
288307
get { return _AllowEmptyWikiLinkTarget; }
289-
set { _AllowEmptyWikiLinkTarget = value; _DefensiveCopy = null; }
308+
set
309+
{
310+
_AllowEmptyWikiLinkTarget = value;
311+
_DefensiveCopy = null;
312+
}
290313
}
291314

292315
/// <summary>
@@ -299,22 +322,49 @@ public bool AllowEmptyWikiLinkTarget
299322
public bool AllowEmptyExternalLinkTarget
300323
{
301324
get { return _AllowEmptyExternalLinkTarget; }
302-
set { _AllowEmptyExternalLinkTarget = value; _DefensiveCopy = null; }
325+
set
326+
{
327+
_AllowEmptyExternalLinkTarget = value;
328+
_DefensiveCopy = null;
329+
}
303330
}
304331

305332
/// <summary>
306-
/// When parsing for wikilinks, templates, and HTML tags, allows inference of missing close marks.
333+
/// When parsing for wikilinks and templates, allows inference of missing close marks. Defaults to <c>false</c>.
307334
/// </summary>
308335
public bool AllowClosingMarkInference
309336
{
310337
get { return _AllowClosingMarkInference; }
311-
set { _AllowClosingMarkInference = value; _DefensiveCopy = null; }
338+
set
339+
{
340+
_AllowClosingMarkInference = value;
341+
_DefensiveCopy = null;
342+
}
343+
}
344+
345+
/// <summary>
346+
/// Namespace names that cause a <see cref="WikiLink"/> expression to be parsed as a <see cref="WikiImageLink"/> expression.
347+
/// </summary>
348+
/// <value>A list of namespace names, or <c>null</c> to use the default settings. Name comparison is case-insensitive.</value>
349+
/// <remarks>Default value is <c>["File", "Image"]</c>.</remarks>
350+
public IEnumerable<string> ImageNamespaceNames
351+
{
352+
get { return _ImageNamespaceNames; }
353+
set
354+
{
355+
_ImageNamespaceNames = value;
356+
_DefensiveCopy = null;
357+
}
312358
}
313359

314360
public bool WithLineInfo
315361
{
316362
get { return _WithLineInfo; }
317-
set { _WithLineInfo = value; _DefensiveCopy = null; }
363+
set
364+
{
365+
_WithLineInfo = value;
366+
_DefensiveCopy = null;
367+
}
318368
}
319369

320370
internal ISet<string> ParserTagsSet => (ISet<string>) ParserTags;
@@ -325,6 +375,10 @@ public bool WithLineInfo
325375

326376
internal ISet<string> CaseInsensitiveMagicTemplateNamesSet { get; private set; }
327377

378+
internal ISet<string> ImageNamespaceNamesSet => (ISet<string>) ImageNamespaceNames;
379+
380+
internal string ImageNamespaceRegexp { get; private set; }
381+
328382
internal WikitextParserOptions DefensiveCopy()
329383
{
330384
// This method should be thread-safe when there are concurrent DefensiveCopy calls.
@@ -334,10 +388,21 @@ internal WikitextParserOptions DefensiveCopy()
334388
inst._ParserTags = ParserTags == null || ReferenceEquals(ParserTags, DefaultParserTags)
335389
? DefaultParserTagsSet
336390
: new HashSet<string>(ParserTags, StringComparer.OrdinalIgnoreCase);
337-
inst._SelfClosingOnlyTags = SelfClosingOnlyTags == null ||
338-
ReferenceEquals(SelfClosingOnlyTags, DefaultSelfClosingOnlyTags)
391+
inst._SelfClosingOnlyTags = SelfClosingOnlyTags == null || ReferenceEquals(SelfClosingOnlyTags, DefaultSelfClosingOnlyTags)
339392
? DefaultSelfClosingOnlyTagsSet
340393
: new HashSet<string>(SelfClosingOnlyTags, StringComparer.OrdinalIgnoreCase);
394+
if (ImageNamespaceNames == null || ReferenceEquals(ImageNamespaceNames, DefaultImageNamespaceNames))
395+
{
396+
inst._ImageNamespaceNames = DefaultImageNamespaceNamesSet;
397+
inst.ImageNamespaceRegexp = DefaultImageNamespaceNameRegexp;
398+
}
399+
else
400+
{
401+
var collection = ImageNamespaceNames as ICollection<string> ?? ImageNamespaceNames.ToList();
402+
inst.ImageNamespaceRegexp = string.Join("|", collection.Select(Regex.Escape));
403+
inst._ImageNamespaceNames = new HashSet<string>(collection, StringComparer.OrdinalIgnoreCase);
404+
}
405+
341406
if (inst.MagicTemplateNames == null || ReferenceEquals(MagicTemplateNames, DefaultMagicTemplateNames))
342407
{
343408
inst.CaseSensitiveMagicTemplateNamesSet = DefaultCaseSensitiveMagicTemplatesSet;
@@ -353,6 +418,7 @@ internal WikitextParserOptions DefensiveCopy()
353418
else inst.CaseInsensitiveMagicTemplateNamesSet.Add(tn.Name);
354419
}
355420
}
421+
356422
inst._MagicTemplateNames = null;
357423
_DefensiveCopy = inst;
358424
return inst;

UnitTestProject1/BasicParsingTests.cs

+9-5
Original file line numberDiff line numberDiff line change
@@ -148,14 +148,18 @@ public void TestWikiLink2()
148148
}
149149

150150
[Fact]
151-
public void TestWikiLink3()
151+
public void TestWikiImageLink()
152152
{
153153
var root = ParseAndAssert(
154+
"[[File:example.jpg|link=http://wikipedia.org/wiki/Test|thumb|upright|caption|caption2]]",
155+
"P[{{File:example.jpg|P[link]=P[-[http://wikipedia.org/wiki/Test]-]|P[thumb]|P[upright]|P[caption]|P[caption2]]]]");
156+
Assert.Equal("caption2", root.EnumDescendants().OfType<WikiImageLink>().First().Arguments.Caption.ToString());
157+
Assert.Equal("http://wikipedia.org/wiki/Test", root.EnumDescendants().OfType<WikiImageLink>().First().Arguments.Link.ToString());
158+
root = ParseAndAssert(
154159
"[[Bestand:Bundesarchiv Bild 146III-373, Modell der Neugestaltung Berlins (\"Germania\").jpg|miniatuur|260px|right| Schaalmodel van de [[Welthauptstadt Germania]], 1939]]",
155-
"P[[[Duck|ducks]]\n]");
156-
//var link = (WikiLink)root.Lines.First().EnumChildren().First();
157-
//Assert.Equal("Duck", link.Target.ToString());
158-
//Assert.Equal("ducks", link.Text.ToString());
160+
"P[{{Bestand:Bundesarchiv Bild 146III-373, Modell der Neugestaltung Berlins (\"Germania\").jpg|P[miniatuur]|P[260px]|P[right]|P[ Schaalmodel van de [[Welthauptstadt Germania]], 1939]]]]",
161+
new WikitextParserOptions { ImageNamespaceNames = new[] { "File", "Image", "bestand" } });
162+
Assert.Equal(" Schaalmodel van de Welthauptstadt Germania, 1939", root.ToPlainText());
159163
}
160164

161165
[Fact]

UnitTestProject1/Primitive/ParserTestBase.cs

+2-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,8 @@ public Wikitext ParseAndAssert(string text, string expectedDump, WikitextParserO
5757
Output.WriteLine(EscapeString(rootExpr));
5858
Output.WriteLine("=============================");
5959
Assert.Equal(EscapeString(expectedDump), EscapeString(rootExpr));
60-
if (!options.AllowClosingMarkInference) Assert.Equal(text, parsedText);
60+
if (!options.AllowClosingMarkInference)
61+
Assert.Equal(text, parsedText);
6162
return root;
6263
}
6364

0 commit comments

Comments
 (0)