3
3
declare (strict_types=1 );
4
4
5
5
/**
6
- * Very basic and naive feed parser that srapes out rss 0.91, 1.0, 2.0 and atom 1.0 .
6
+ * Very basic and naive feed parser.
7
7
*
8
- * Emit arrays meant to be used inside rss-bridge .
8
+ * Scrapes out RSS 0.91, 1.0, 2.0 and Atom 1.0.
9
9
*
10
- * The feed item structure is identical to that of FeedItem
10
+ * Produces arrays meant to be used inside rss-bridge.
11
+ *
12
+ * The item structure is tweaked so that it works with FeedItem
11
13
*/
12
14
final class FeedParser
13
15
{
@@ -85,9 +87,7 @@ public function parseFeed(string $xmlString): array
85
87
86
88
public function parseAtomItem (\SimpleXMLElement $ feedItem ): array
87
89
{
88
- // Some ATOM entries also contain RSS 2.0 fields
89
90
$ item = $ this ->parseRss2Item ($ feedItem );
90
-
91
91
if (isset ($ feedItem ->id )) {
92
92
$ item ['uri ' ] = (string )$ feedItem ->id ;
93
93
}
@@ -131,16 +131,60 @@ public function parseAtomItem(\SimpleXMLElement $feedItem): array
131
131
132
132
public function parseRss2Item (\SimpleXMLElement $ feedItem ): array
133
133
{
134
- // Primary data is compatible to 0.91 with some additional data
135
- $ item = $ this ->parseRss091Item ($ feedItem );
134
+ $ item = [
135
+ 'uri ' => '' ,
136
+ 'title ' => '' ,
137
+ 'content ' => '' ,
138
+ 'timestamp ' => '' ,
139
+ 'author ' => '' ,
140
+ //'uid' => null,
141
+ //'categories' => [],
142
+ //'enclosures' => [],
143
+ ];
144
+
145
+ foreach ($ feedItem as $ k => $ v ) {
146
+ $ hasChildren = count ($ v ) !== 0 ;
147
+ if (!$ hasChildren ) {
148
+ $ item [$ k ] = (string ) $ v ;
149
+ }
150
+ }
151
+
152
+ if (isset ($ feedItem ->link )) {
153
+ // todo: trim uri
154
+ $ item ['uri ' ] = (string )$ feedItem ->link ;
155
+ }
156
+ if (isset ($ feedItem ->title )) {
157
+ $ item ['title ' ] = html_entity_decode ((string )$ feedItem ->title );
158
+ }
159
+ if (isset ($ feedItem ->description )) {
160
+ $ item ['content ' ] = (string )$ feedItem ->description ;
161
+ }
162
+
136
163
$ namespaces = $ feedItem ->getNamespaces (true );
137
164
if (isset ($ namespaces ['dc ' ])) {
138
165
$ dc = $ feedItem ->children ($ namespaces ['dc ' ]);
139
166
}
140
167
if (isset ($ namespaces ['media ' ])) {
141
168
$ media = $ feedItem ->children ($ namespaces ['media ' ]);
142
169
}
143
-
170
+ foreach ($ namespaces as $ namespaceName => $ namespaceUrl ) {
171
+ if (in_array ($ namespaceName , ['' , 'content ' , 'media ' ])) {
172
+ continue ;
173
+ }
174
+ $ module = $ feedItem ->children ($ namespaceUrl );
175
+ $ item [$ namespaceName ] = [];
176
+ foreach ($ module as $ moduleKey => $ moduleValue ) {
177
+ $ item [$ namespaceName ][$ moduleKey ] = (string ) $ moduleValue ;
178
+ }
179
+ }
180
+ if (isset ($ namespaces ['itunes ' ])) {
181
+ $ enclosure = $ feedItem ->enclosure ;
182
+ $ item ['enclosure ' ] = [
183
+ 'url ' => (string ) $ enclosure ['url ' ],
184
+ 'length ' => (string ) $ enclosure ['length ' ],
185
+ 'type ' => (string ) $ enclosure ['type ' ],
186
+ ];
187
+ }
144
188
if (isset ($ feedItem ->guid )) {
145
189
// Pluck out a url from guid
146
190
foreach ($ feedItem ->guid ->attributes () as $ attribute => $ value ) {
@@ -184,30 +228,13 @@ public function parseRss2Item(\SimpleXMLElement $feedItem): array
184
228
}
185
229
186
230
public function parseRss1Item (\SimpleXMLElement $ feedItem ): array
187
- {
188
- // 1.0 adds optional elements around the 0.91 standard
189
- $ item = $ this ->parseRss091Item ($ feedItem );
190
- $ namespaces = $ feedItem ->getNamespaces (true );
191
- if (isset ($ namespaces ['dc ' ])) {
192
- $ dc = $ feedItem ->children ($ namespaces ['dc ' ]);
193
- if (isset ($ dc ->date )) {
194
- $ item ['timestamp ' ] = strtotime ((string )$ dc ->date );
195
- }
196
- if (isset ($ dc ->creator )) {
197
- $ item ['author ' ] = (string )$ dc ->creator ;
198
- }
199
- }
200
- return $ item ;
201
- }
202
-
203
- public function parseRss091Item (\SimpleXMLElement $ feedItem ): array
204
231
{
205
232
$ item = [
206
- 'uri ' => null ,
207
- 'title ' => null ,
208
- 'content ' => null ,
209
- 'timestamp ' => null ,
210
- 'author ' => null ,
233
+ 'uri ' => '' ,
234
+ 'title ' => '' ,
235
+ 'content ' => '' ,
236
+ 'timestamp ' => '' ,
237
+ 'author ' => '' ,
211
238
//'uid' => null,
212
239
//'categories' => [],
213
240
//'enclosures' => [],
@@ -219,12 +246,19 @@ public function parseRss091Item(\SimpleXMLElement $feedItem): array
219
246
if (isset ($ feedItem ->title )) {
220
247
$ item ['title ' ] = html_entity_decode ((string )$ feedItem ->title );
221
248
}
222
- // rss 0.91 doesn't support timestamps
223
- // rss 0.91 doesn't support authors
224
- // rss 0.91 doesn't support enclosures
225
249
if (isset ($ feedItem ->description )) {
226
250
$ item ['content ' ] = (string )$ feedItem ->description ;
227
251
}
252
+ $ namespaces = $ feedItem ->getNamespaces (true );
253
+ if (isset ($ namespaces ['dc ' ])) {
254
+ $ dc = $ feedItem ->children ($ namespaces ['dc ' ]);
255
+ if (isset ($ dc ->date )) {
256
+ $ item ['timestamp ' ] = strtotime ((string )$ dc ->date );
257
+ }
258
+ if (isset ($ dc ->creator )) {
259
+ $ item ['author ' ] = (string )$ dc ->creator ;
260
+ }
261
+ }
228
262
return $ item ;
229
263
}
230
264
}
0 commit comments