Skip to content

Commit ef5bd83

Browse files
authored
feat: preserve and reproduce podcast feeds (itunes rss module) (RSS-Bridge#3759)
1 parent 408c2e5 commit ef5bd83

7 files changed

+121
-80
lines changed

bridges/CssSelectorFeedExpanderBridge.php

+3-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,9 @@ public function collectData()
5050
$discard_thumbnail = $this->getInput('discard_thumbnail');
5151
$limit = $this->getInput('limit');
5252

53-
$source_feed = (new FeedParser())->parseFeed(getContents($url));
53+
$feedParser = new FeedParser();
54+
$xml = getContents($url);
55+
$source_feed = $feedParser->parseFeed($xml);
5456
$items = $source_feed['items'];
5557

5658
// Map Homepage URL (Default: Root page)

bridges/NyaaTorrentsBridge.php

+9-34
Original file line numberDiff line numberDiff line change
@@ -62,52 +62,27 @@ class NyaaTorrentsBridge extends BridgeAbstract
6262

6363
public function collectData()
6464
{
65-
// Manually parsing because we need to acccess the nyaa namespace in the xml
66-
$xml = simplexml_load_string(getContents($this->getURI()));
67-
$channel = $xml->channel[0];
68-
$feed = [];
69-
$feed['title'] = trim((string)$channel->title);
70-
$feed['uri'] = trim((string)$channel->link);
71-
if (!empty($channel->image)) {
72-
$feed['icon'] = trim((string)$channel->image->url);
73-
}
74-
$items = $xml->channel[0]->item;
75-
foreach ($items as $feedItem) {
76-
$item = [
77-
'title' => (string) $feedItem->title,
78-
'uri' => (string) $feedItem->link,
79-
];
80-
65+
$feedParser = new FeedParser();
66+
$feed = $feedParser->parseFeed(getContents($this->getURI()));
8167

68+
foreach ($feed['items'] as $item) {
8269
$item['id'] = str_replace(['https://nyaa.si/download/', '.torrent'], '', $item['uri']);
83-
84-
$nyaaNamespace = (array)($feedItem->children('nyaa', true));
85-
$item = array_merge($item, $nyaaNamespace);
86-
87-
// Convert URI from torrent file to web page
8870
$item['uri'] = str_replace('/download/', '/view/', $item['uri']);
8971
$item['uri'] = str_replace('.torrent', '', $item['uri']);
90-
91-
$item_html = getSimpleHTMLDOMCached($item['uri']);
92-
if ($item_html) {
93-
// Retrieve full description from page contents
94-
$item_desc = str_get_html(
95-
markdownToHtml(html_entity_decode($item_html->find('#torrent-description', 0)->innertext))
96-
);
97-
98-
// Retrieve image for thumbnail or generic logo fallback
72+
$dom = getSimpleHTMLDOMCached($item['uri']);
73+
if ($dom) {
74+
$description = $dom->find('#torrent-description', 0)->innertext ?? '';
75+
$itemDom = str_get_html(markdownToHtml(html_entity_decode($description)));
9976
$item_image = $this->getURI() . 'static/img/avatar/default.png';
100-
foreach ($item_desc->find('img') as $img) {
77+
foreach ($itemDom->find('img') as $img) {
10178
if (strpos($img->src, 'prez') === false) {
10279
$item_image = $img->src;
10380
break;
10481
}
10582
}
106-
10783
$item['enclosures'] = [$item_image];
108-
$item['content'] = $item_desc;
84+
$item['content'] = (string) $itemDom;
10985
}
110-
11186
$this->items[] = $item;
11287
if (count($this->items) >= 10) {
11388
break;

composer.json

+2-2
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
"ext-openssl": "*",
2929
"ext-libxml": "*",
3030
"ext-simplexml": "*",
31+
"ext-dom": "*",
3132
"ext-json": "*"
3233
},
3334
"require-dev": {
@@ -38,8 +39,7 @@
3839
"ext-memcached": "Allows to use memcached as cache type",
3940
"ext-sqlite3": "Allows to use an SQLite database for caching",
4041
"ext-zip": "Required for FDroidRepoBridge",
41-
"ext-intl": "Required for OLXBridge",
42-
"ext-dom": "Allows to use some bridges based on XPath expressions"
42+
"ext-intl": "Required for OLXBridge"
4343
},
4444
"autoload-dev": {
4545
"psr-4": {

formats/AtomFormat.php

+16-2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ class AtomFormat extends FormatAbstract
1616

1717
public function stringify()
1818
{
19+
$document = new \DomDocument('1.0', $this->getCharset());
20+
1921
$feedUrl = get_current_url();
2022

2123
$extraInfos = $this->getExtraInfos();
@@ -25,7 +27,6 @@ public function stringify()
2527
$uri = $extraInfos['uri'];
2628
}
2729

28-
$document = new \DomDocument('1.0', $this->getCharset());
2930
$document->formatOutput = true;
3031
$feed = $document->createElementNS(self::ATOM_NS, 'feed');
3132
$document->appendChild($feed);
@@ -81,6 +82,7 @@ public function stringify()
8182
$linkSelf->setAttribute('href', $feedUrl);
8283

8384
foreach ($this->getItems() as $item) {
85+
$itemArray = $item->toArray();
8486
$entryTimestamp = $item->getTimestamp();
8587
$entryTitle = $item->getTitle();
8688
$entryContent = $item->getContent();
@@ -138,7 +140,19 @@ public function stringify()
138140
$entry->appendChild($id);
139141
$id->appendChild($document->createTextNode($entryID));
140142

141-
if (!empty($entryUri)) {
143+
if (isset($itemArray['itunes'])) {
144+
$feed->setAttributeNS('http://www.w3.org/2000/xmlns/', 'xmlns:itunes', self::ITUNES_NS);
145+
foreach ($itemArray['itunes'] as $itunesKey => $itunesValue) {
146+
$itunesProperty = $document->createElementNS(self::ITUNES_NS, $itunesKey);
147+
$entry->appendChild($itunesProperty);
148+
$itunesProperty->appendChild($document->createTextNode($itunesValue));
149+
}
150+
$itunesEnclosure = $document->createElement('enclosure');
151+
$entry->appendChild($itunesEnclosure);
152+
$itunesEnclosure->setAttribute('url', $itemArray['enclosure']['url']);
153+
$itunesEnclosure->setAttribute('length', $itemArray['enclosure']['length']);
154+
$itunesEnclosure->setAttribute('type', $itemArray['enclosure']['type']);
155+
} elseif (!empty($entryUri)) {
142156
$entryLinkAlternate = $document->createElement('link');
143157
$entry->appendChild($entryLinkAlternate);
144158
$entryLinkAlternate->setAttribute('rel', 'alternate');

formats/MrssFormat.php

+22-8
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ class MrssFormat extends FormatAbstract
3434

3535
public function stringify()
3636
{
37+
$document = new \DomDocument('1.0', $this->getCharset());
38+
3739
$feedUrl = get_current_url();
3840
$extraInfos = $this->getExtraInfos();
3941
if (empty($extraInfos['uri'])) {
@@ -42,7 +44,6 @@ public function stringify()
4244
$uri = $extraInfos['uri'];
4345
}
4446

45-
$document = new \DomDocument('1.0', $this->getCharset());
4647
$document->formatOutput = true;
4748
$feed = $document->createElement('rss');
4849
$document->appendChild($feed);
@@ -99,22 +100,23 @@ public function stringify()
99100
$linkSelf->setAttribute('href', $feedUrl);
100101

101102
foreach ($this->getItems() as $item) {
103+
$itemArray = $item->toArray();
102104
$itemTimestamp = $item->getTimestamp();
103105
$itemTitle = $item->getTitle();
104106
$itemUri = $item->getURI();
105107
$itemContent = $item->getContent() ? break_annoying_html_tags($item->getContent()) : '';
106-
$entryID = $item->getUid();
108+
$itemUid = $item->getUid();
107109
$isPermaLink = 'false';
108110

109-
if (empty($entryID) && !empty($itemUri)) {
111+
if (empty($itemUid) && !empty($itemUri)) {
110112
// Fallback to provided URI
111-
$entryID = $itemUri;
113+
$itemUid = $itemUri;
112114
$isPermaLink = 'true';
113115
}
114116

115-
if (empty($entryID)) {
117+
if (empty($itemUid)) {
116118
// Fallback to title and content
117-
$entryID = hash('sha1', $itemTitle . $itemContent);
119+
$itemUid = hash('sha1', $itemTitle . $itemContent);
118120
}
119121

120122
$entry = $document->createElement('item');
@@ -126,7 +128,19 @@ public function stringify()
126128
$entryTitle->appendChild($document->createTextNode($itemTitle));
127129
}
128130

129-
if (!empty($itemUri)) {
131+
if (isset($itemArray['itunes'])) {
132+
$feed->setAttributeNS('http://www.w3.org/2000/xmlns/', 'xmlns:itunes', self::ITUNES_NS);
133+
foreach ($itemArray['itunes'] as $itunesKey => $itunesValue) {
134+
$itunesProperty = $document->createElementNS(self::ITUNES_NS, $itunesKey);
135+
$entry->appendChild($itunesProperty);
136+
$itunesProperty->appendChild($document->createTextNode($itunesValue));
137+
}
138+
$itunesEnclosure = $document->createElement('enclosure');
139+
$entry->appendChild($itunesEnclosure);
140+
$itunesEnclosure->setAttribute('url', $itemArray['enclosure']['url']);
141+
$itunesEnclosure->setAttribute('length', $itemArray['enclosure']['length']);
142+
$itunesEnclosure->setAttribute('type', $itemArray['enclosure']['type']);
143+
} if (!empty($itemUri)) {
130144
$entryLink = $document->createElement('link');
131145
$entry->appendChild($entryLink);
132146
$entryLink->appendChild($document->createTextNode($itemUri));
@@ -135,7 +149,7 @@ public function stringify()
135149
$entryGuid = $document->createElement('guid');
136150
$entryGuid->setAttribute('isPermaLink', $isPermaLink);
137151
$entry->appendChild($entryGuid);
138-
$entryGuid->appendChild($document->createTextNode($entryID));
152+
$entryGuid->appendChild($document->createTextNode($itemUid));
139153

140154
if (!empty($itemTimestamp)) {
141155
$entryPublished = $document->createElement('pubDate');

lib/FeedParser.php

+67-33
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
declare(strict_types=1);
44

55
/**
6-
* Very basic and naive feed parser that srapes out rss 0.91, 1.0, 2.0 and atom 1.0.
6+
* Very basic and naive feed parser.
77
*
8-
* Emit arrays meant to be used inside rss-bridge.
8+
* Scrapes out rss 0.91, 1.0, 2.0 and atom 1.0.
99
*
10-
* The feed item structure is identical to that of FeedItem
10+
* Produces arrays meant to be used inside rss-bridge.
11+
*
12+
* The item structure is tweaked so that it works with FeedItem
1113
*/
1214
final class FeedParser
1315
{
@@ -85,9 +87,7 @@ public function parseFeed(string $xmlString): array
8587

8688
public function parseAtomItem(\SimpleXMLElement $feedItem): array
8789
{
88-
// Some ATOM entries also contain RSS 2.0 fields
8990
$item = $this->parseRss2Item($feedItem);
90-
9191
if (isset($feedItem->id)) {
9292
$item['uri'] = (string)$feedItem->id;
9393
}
@@ -131,16 +131,60 @@ public function parseAtomItem(\SimpleXMLElement $feedItem): array
131131

132132
public function parseRss2Item(\SimpleXMLElement $feedItem): array
133133
{
134-
// Primary data is compatible to 0.91 with some additional data
135-
$item = $this->parseRss091Item($feedItem);
134+
$item = [
135+
'uri' => '',
136+
'title' => '',
137+
'content' => '',
138+
'timestamp' => '',
139+
'author' => '',
140+
//'uid' => null,
141+
//'categories' => [],
142+
//'enclosures' => [],
143+
];
144+
145+
foreach ($feedItem as $k => $v) {
146+
$hasChildren = count($v) !== 0;
147+
if (!$hasChildren) {
148+
$item[$k] = (string) $v;
149+
}
150+
}
151+
152+
if (isset($feedItem->link)) {
153+
// todo: trim uri
154+
$item['uri'] = (string)$feedItem->link;
155+
}
156+
if (isset($feedItem->title)) {
157+
$item['title'] = html_entity_decode((string)$feedItem->title);
158+
}
159+
if (isset($feedItem->description)) {
160+
$item['content'] = (string)$feedItem->description;
161+
}
162+
136163
$namespaces = $feedItem->getNamespaces(true);
137164
if (isset($namespaces['dc'])) {
138165
$dc = $feedItem->children($namespaces['dc']);
139166
}
140167
if (isset($namespaces['media'])) {
141168
$media = $feedItem->children($namespaces['media']);
142169
}
143-
170+
foreach ($namespaces as $namespaceName => $namespaceUrl) {
171+
if (in_array($namespaceName, ['', 'content', 'media'])) {
172+
continue;
173+
}
174+
$module = $feedItem->children($namespaceUrl);
175+
$item[$namespaceName] = [];
176+
foreach ($module as $moduleKey => $moduleValue) {
177+
$item[$namespaceName][$moduleKey] = (string) $moduleValue;
178+
}
179+
}
180+
if (isset($namespaces['itunes'])) {
181+
$enclosure = $feedItem->enclosure;
182+
$item['enclosure'] = [
183+
'url' => (string) $enclosure['url'],
184+
'length' => (string) $enclosure['length'],
185+
'type' => (string) $enclosure['type'],
186+
];
187+
}
144188
if (isset($feedItem->guid)) {
145189
// Pluck out a url from guid
146190
foreach ($feedItem->guid->attributes() as $attribute => $value) {
@@ -184,30 +228,13 @@ public function parseRss2Item(\SimpleXMLElement $feedItem): array
184228
}
185229

186230
public function parseRss1Item(\SimpleXMLElement $feedItem): array
187-
{
188-
// 1.0 adds optional elements around the 0.91 standard
189-
$item = $this->parseRss091Item($feedItem);
190-
$namespaces = $feedItem->getNamespaces(true);
191-
if (isset($namespaces['dc'])) {
192-
$dc = $feedItem->children($namespaces['dc']);
193-
if (isset($dc->date)) {
194-
$item['timestamp'] = strtotime((string)$dc->date);
195-
}
196-
if (isset($dc->creator)) {
197-
$item['author'] = (string)$dc->creator;
198-
}
199-
}
200-
return $item;
201-
}
202-
203-
public function parseRss091Item(\SimpleXMLElement $feedItem): array
204231
{
205232
$item = [
206-
'uri' => null,
207-
'title' => null,
208-
'content' => null,
209-
'timestamp' => null,
210-
'author' => null,
233+
'uri' => '',
234+
'title' => '',
235+
'content' => '',
236+
'timestamp' => '',
237+
'author' => '',
211238
//'uid' => null,
212239
//'categories' => [],
213240
//'enclosures' => [],
@@ -219,12 +246,19 @@ public function parseRss091Item(\SimpleXMLElement $feedItem): array
219246
if (isset($feedItem->title)) {
220247
$item['title'] = html_entity_decode((string)$feedItem->title);
221248
}
222-
// rss 0.91 doesn't support timestamps
223-
// rss 0.91 doesn't support authors
224-
// rss 0.91 doesn't support enclosures
225249
if (isset($feedItem->description)) {
226250
$item['content'] = (string)$feedItem->description;
227251
}
252+
$namespaces = $feedItem->getNamespaces(true);
253+
if (isset($namespaces['dc'])) {
254+
$dc = $feedItem->children($namespaces['dc']);
255+
if (isset($dc->date)) {
256+
$item['timestamp'] = strtotime((string)$dc->date);
257+
}
258+
if (isset($dc->creator)) {
259+
$item['author'] = (string)$dc->creator;
260+
}
261+
}
228262
return $item;
229263
}
230264
}

lib/FormatAbstract.php

+2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
abstract class FormatAbstract
44
{
5+
public const ITUNES_NS = 'http://www.itunes.com/dtds/podcast-1.0.dtd';
6+
57
const MIME_TYPE = 'text/plain';
68

79
protected string $charset = 'UTF-8';

0 commit comments

Comments
 (0)