Skip to content

Commit bdcd104

Browse files
oleibman authored and Progi1984 committed
Improve ODText Content Reader
Fix #2493. There is much that the ODT Reader ignores. This change adds support for the `text:section`, `text:span`, `text:s`, and `text:tab` tags, thereby handling multiple sections, text runs, tab characters, and multiple spaces. There will still be many omissions (e.g. styles and tables), but you will now often be able to access the text content of valid ODT documents. The issue suggests variations in a simple file created on its own by LibreOffice, and a similar file created by PhpWord. Both are unit-tested. A `getText` method is added to TextRun to facilitate testing (and can be useful on its own). It will return the concatenated texts of all elements of the text run.
1 parent b0e1e41 commit bdcd104

File tree

6 files changed

+171
-11
lines changed

6 files changed

+171
-11
lines changed

docs/changes/1.x/1.2.0.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
- Added Support for Language, both for document overall and individual text elements
3232
- Template : Set a checkbox by [@nxtpge](https://github.com/nxtpge) in [#2509](https://github.com/PHPOffice/PHPWord/pull/2509)
3333
- ODText / RTF / Word2007 Writer : Add field FILENAME by [@milkyway-git](https://github.com/milkyway-git) in [#2510](https://github.com/PHPOffice/PHPWord/pull/2510)
34+
- ODText Reader : Improve Section Reader by [@oleibman](https://github.com/oleibman) in [#2507](https://github.com/PHPOffice/PHPWord/pull/2507)
3435

3536
### Bug fixes
3637

phpstan-baseline.neon

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -165,11 +165,6 @@ parameters:
165165
count: 1
166166
path: src/PhpWord/Reader/HTML.php
167167

168-
-
169-
message: "#^Call to an undefined method DOMNode\\:\\:getAttribute\\(\\)\\.$#"
170-
count: 2
171-
path: src/PhpWord/Reader/ODText/Content.php
172-
173168
-
174169
message: "#^Offset 'textNodes' on array\\{changed\\: PhpOffice\\\\PhpWord\\\\Element\\\\TrackChange, textNodes\\: DOMNodeList\\<DOMElement\\>\\} in isset\\(\\) always exists and is not nullable\\.$#"
175170
count: 1

src/PhpWord/Element/TextRun.php

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,4 +78,16 @@ public function setParagraphStyle($style = null)
7878

7979
return $this->paragraphStyle;
8080
}
81+
82+
/**
 * Get the plain text of this text run.
 *
 * Walks the run's direct child elements in order and joins the text of
 * every child that is a Text element; children of any other type are skipped.
 *
 * @return string Concatenated text, or an empty string when no Text child exists
 */
public function getText(): string
{
    $pieces = [];
    foreach ($this->getElements() as $child) {
        if (!($child instanceof Text)) {
            continue;
        }
        $pieces[] = $child->getText();
    }

    return implode('', $pieces);
}
8193
}

src/PhpWord/Reader/ODText/Content.php

Lines changed: 75 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,10 @@
1818
namespace PhpOffice\PhpWord\Reader\ODText;
1919

2020
use DateTime;
21+
use DOMElement;
22+
use DOMNodeList;
2123
use PhpOffice\Math\Reader\MathML;
24+
use PhpOffice\PhpWord\Element\Section;
2225
use PhpOffice\PhpWord\Element\TrackChange;
2326
use PhpOffice\PhpWord\PhpWord;
2427
use PhpOffice\PhpWord\Shared\XMLReader;
@@ -30,6 +33,9 @@
3033
*/
3134
class Content extends AbstractPart
3235
{
36+
/** @var ?Section */
37+
private $section;
38+
3339
/**
3440
* Read content.xml.
3541
*/
@@ -41,17 +47,28 @@ public function read(PhpWord $phpWord): void
4147
$trackedChanges = [];
4248

4349
$nodes = $xmlReader->getElements('office:body/office:text/*');
50+
$this->section = null;
51+
$this->processNodes($nodes, $xmlReader, $phpWord);
52+
$this->section = null;
53+
}
54+
55+
/** @param DOMNodeList<DOMElement> $nodes */
56+
public function processNodes(DOMNodeList $nodes, XMLReader $xmlReader, PhpWord $phpWord): void
57+
{
4458
if ($nodes->length > 0) {
45-
$section = $phpWord->addSection();
4659
foreach ($nodes as $node) {
4760
// $styleName = $xmlReader->getAttribute('text:style-name', $node);
4861
switch ($node->nodeName) {
4962
case 'text:h': // Heading
5063
$depth = $xmlReader->getAttribute('text:outline-level', $node);
51-
$section->addTitle($node->nodeValue, $depth);
64+
$this->getSection($phpWord)->addTitle($node->nodeValue, $depth);
5265

5366
break;
5467
case 'text:p': // Paragraph
68+
$styleName = $xmlReader->getAttribute('text:style-name', $node);
69+
if (substr($styleName, 0, 2) === 'SB') {
70+
break;
71+
}
5572
$element = $xmlReader->getElement('draw:frame/draw:object', $node);
5673
if ($element) {
5774
$mathFile = str_replace('./', '', $element->getAttribute('xlink:href')) . '/content.xml';
@@ -65,11 +82,13 @@ public function read(PhpWord $phpWord): void
6582
$reader = new MathML();
6683
$math = $reader->read($mathXML);
6784

68-
$section->addFormula($math);
85+
$this->getSection($phpWord)->addFormula($math);
6986
}
7087
}
7188
} else {
7289
$children = $node->childNodes;
90+
$spans = false;
91+
/** @var DOMElement $child */
7392
foreach ($children as $child) {
7493
switch ($child->nodeName) {
7594
case 'text:change-start':
@@ -89,16 +108,49 @@ public function read(PhpWord $phpWord): void
89108
$changed = $trackedChanges[$changeId];
90109
}
91110

111+
break;
112+
case 'text:span':
113+
$spans = true;
114+
92115
break;
93116
}
94117
}
95118

96-
$element = $section->addText($node->nodeValue);
119+
if ($spans) {
120+
$element = $this->getSection($phpWord)->addTextRun();
121+
foreach ($children as $child) {
122+
switch ($child->nodeName) {
123+
case 'text:span':
124+
/** @var DOMElement $child2 */
125+
foreach ($child->childNodes as $child2) {
126+
switch ($child2->nodeName) {
127+
case '#text':
128+
$element->addText($child2->nodeValue);
129+
130+
break;
131+
case 'text:tab':
132+
$element->addText("\t");
133+
134+
break;
135+
case 'text:s':
136+
$spaces = (int) $child2->getAttribute('text:c') ?: 1;
137+
$element->addText(str_repeat(' ', $spaces));
138+
139+
break;
140+
}
141+
}
142+
143+
break;
144+
}
145+
}
146+
} else {
147+
$element = $this->getSection($phpWord)->addText($node->nodeValue);
148+
}
97149
if (isset($changed) && is_array($changed)) {
98150
$element->setTrackChange($changed['changed']);
99151
if (isset($changed['textNodes'])) {
100152
foreach ($changed['textNodes'] as $changedNode) {
101-
$element = $section->addText($changedNode->nodeValue);
153+
$element = $this->getSection($phpWord)->addText($changedNode->nodeValue);
102154
$element->setTrackChange($changed['changed']);
103155
}
104156
}
@@ -110,7 +162,7 @@ public function read(PhpWord $phpWord): void
110162
$listItems = $xmlReader->getElements('text:list-item/text:p', $node);
111163
foreach ($listItems as $listItem) {
112164
// $listStyleName = $xmlReader->getAttribute('text:style-name', $listItem);
113-
$section->addListItem($listItem->nodeValue, 0);
165+
$this->getSection($phpWord)->addListItem($listItem->nodeValue, 0);
114166
}
115167

116168
break;
@@ -129,9 +181,26 @@ public function read(PhpWord $phpWord): void
129181
$trackedChanges[$changedRegion->getAttribute('text:id')] = ['changed' => $changed, 'textNodes' => $textNodes];
130182
}
131183

184+
break;
185+
case 'text:section': // Section
186+
// $sectionStyleName = $xmlReader->getAttribute('text:style-name', $listItem);
187+
$this->section = $phpWord->addSection();
188+
$children = $node->childNodes;
189+
$this->processNodes($children, $xmlReader, $phpWord);
190+
132191
break;
133192
}
134193
}
135194
}
136195
}
196+
197+
/**
 * Return the section currently being filled, creating one on first use.
 *
 * When no section has been recorded yet, a new one is added to the
 * document and remembered so that subsequent calls reuse it.
 */
private function getSection(PhpWord $phpWord): Section
{
    if ($this->section instanceof Section) {
        return $this->section;
    }

    $created = $phpWord->addSection();
    $this->section = $created;

    return $created;
}
137206
}
tests/PhpWordTests/Reader/ODText/ODTextSectionTest.php

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
<?php
2+
/**
3+
* This file is part of PHPWord - A pure PHP library for reading and writing
4+
* word processing documents.
5+
*
6+
* PHPWord is free software distributed under the terms of the GNU Lesser
7+
* General Public License version 3 as published by the Free Software Foundation.
8+
*
9+
* For the full copyright and license information, please read the LICENSE
10+
* file that was distributed with this source code. For the full list of
11+
* contributors, visit https://github.com/PHPOffice/PHPWord/contributors.
12+
*
13+
* @see https://github.com/PHPOffice/PHPWord
14+
*
15+
* @license http://www.gnu.org/licenses/lgpl.txt LGPL version 3
16+
*/
17+
18+
namespace PhpOffice\PhpWordTests\Reader\ODText;
19+
20+
use PhpOffice\PhpWord\IOFactory;
21+
use PhpOffice\PhpWord\PhpWord;
22+
use PhpOffice\PhpWord\Settings;
23+
24+
/**
 * Tests for reading paragraph text back through the ODText reader, both from
 * a document written by PhpWord itself and from a pre-built fixture file.
 */
class ODTextSectionTest extends \PHPUnit\Framework\TestCase
{
    /** @var string Path of the temporary file written by a test; '' when there is none */
    private $filename = '';

    /** @var bool Global output-escaping flag as it was before the test ran */
    private $outputEscapingEnabled = false;

    /**
     * Remember the global output-escaping flag so tearDown can put it back.
     */
    protected function setUp(): void
    {
        $this->outputEscapingEnabled = Settings::isOutputEscapingEnabled();
    }

    /**
     * Restore global settings and remove the temporary file, if one was written.
     */
    protected function tearDown(): void
    {
        // Settings is static state shared by every test; do not leak our change.
        Settings::setOutputEscapingEnabled($this->outputEscapingEnabled);

        if ($this->filename !== '') {
            // The name is assigned before save(), so the file may not exist if saving failed.
            if (is_file($this->filename)) {
                unlink($this->filename);
            }
            $this->filename = '';
        }
    }

    /**
     * Write a one-section document as ODText, read it back, and expect the same texts.
     */
    public function testWriteThenReadSection(): void
    {
        $dir = 'tests/PhpWordTests/_files';
        Settings::setOutputEscapingEnabled(true);
        $phpWord = new PhpWord();
        $section = $phpWord->addSection();
        $inputText = ['days', 'monday', 'tuesday'];
        $inputText[] = "Tab\tthen two spaces then done.";
        foreach ($inputText as $text) {
            $section->addText($text);
        }
        $writer = IOFactory::createWriter($phpWord, 'ODText');
        $this->filename = "$dir/sectiontest.odt";
        $writer->save($this->filename);

        $reader = IOFactory::createReader('ODText');
        $phpWord2 = $reader->load($this->filename);

        self::assertSame($inputText, $this->collectTexts($phpWord2));
    }

    /**
     * Read the word.2493.nosection.odt fixture and expect its three texts.
     */
    public function testReadNoSections(): void
    {
        $dir = 'tests/PhpWordTests/_files/documents';
        $inputText = ['days', 'monday', 'tuesday'];

        $reader = IOFactory::createReader('ODText');
        $filename = "$dir/word.2493.nosection.odt";
        $phpWord2 = $reader->load($filename);

        self::assertSame($inputText, $this->collectTexts($phpWord2));
    }

    /**
     * Gather getText() results from every element of every section, in document order.
     *
     * Elements that are not objects or have no getText() method are skipped.
     *
     * @return array<int, mixed> One entry per element exposing getText()
     */
    private function collectTexts(PhpWord $phpWord): array
    {
        $texts = [];
        foreach ($phpWord->getSections() as $section) {
            foreach ($section->getElements() as $element) {
                if (is_object($element) && method_exists($element, 'getText')) {
                    $texts[] = $element->getText();
                }
            }
        }

        return $texts;
    }
}
tests/PhpWordTests/_files/documents/word.2493.nosection.odt

Binary file not shown.

0 commit comments

Comments
 (0)