|
| 1 | +<?php |
| 2 | + |
| 3 | +declare(strict_types=1); |
| 4 | + |
| 5 | +namespace CodeQ\LinkChecker\Command; |
| 6 | + |
| 7 | +use CodeQ\LinkChecker\Domain\Crawler\ContentNodeCrawler; |
| 8 | +use CodeQ\LinkChecker\Domain\Service\DomainService; |
| 9 | +use CodeQ\LinkChecker\Profile\CheckAllLinks; |
| 10 | +use CodeQ\LinkChecker\Reporter\LogBrokenLinks; |
| 11 | +use CodeQ\LinkChecker\Service\NotificationServiceInterface; |
| 12 | +use GuzzleHttp\RequestOptions; |
| 13 | +use Neos\ContentRepository\Domain\Service\ContextFactoryInterface; |
| 14 | +use Neos\Flow\Annotations as Flow; |
| 15 | +use Neos\Flow\Cli\CommandController; |
| 16 | +use Neos\Flow\I18n\Translator; |
| 17 | +use Neos\Flow\Mvc\Exception\InvalidActionNameException; |
| 18 | +use Neos\Flow\Mvc\Exception\InvalidArgumentNameException; |
| 19 | +use Neos\Flow\Mvc\Exception\InvalidArgumentTypeException; |
| 20 | +use Neos\Flow\Mvc\Exception\InvalidControllerNameException; |
| 21 | +use Neos\Flow\Mvc\Routing\Exception\MissingActionNameException; |
| 22 | +use Neos\Flow\ObjectManagement\Exception\UnresolvedDependenciesException; |
| 23 | +use Neos\Flow\Persistence\Exception\IllegalObjectTypeException; |
| 24 | +use Neos\Flow\Persistence\Exception\InvalidQueryException; |
| 25 | +use Neos\Neos\Domain\Model\Domain; |
| 26 | +use Spatie\Crawler\Crawler; |
| 27 | + |
/**
 * CLI commands that check a site for broken links.
 *
 * Two strategies are offered: crawling the configured URLs over HTTP
 * (crawlExternalLinksCommand) and inspecting the content nodes of the matching
 * domains (crawlNodesCommand). crawlCommand runs both.
 *
 * @Flow\Scope("singleton")
 * @see https://gist.github.com/hhoechtl/9012d455eab52658bbf4
 */
class CheckLinksCommandController extends CommandController
{
    /**
     * Fallback for the "notifications.minimumStatusCode" setting: results with a
     * lower status code are left out of the notification.
     */
    public const MIN_STATUS_CODE = 404;

    /**
     * @var Translator
     * @Flow\Inject
     */
    protected $translator;

    /**
     * @var DomainService
     * @Flow\Inject
     */
    protected $domainService;

    /**
     * @var ContextFactoryInterface
     * @Flow\Inject
     */
    protected $contextFactory;

    /**
     * @var ContentNodeCrawler
     * @Flow\Inject
     */
    protected $contentNodeCrawler;

    /**
     * Class name of the notification service, read from the "notifications.service" setting.
     * May be null when the setting is absent.
     *
     * @var string
     * @Flow\InjectConfiguration(path="notifications.service")
     */
    protected $notificationServiceClass;

    /**
     * Package settings. Defaults to an empty array so that reading a setting
     * before injectSettings() ran does not hit an uninitialized typed property.
     */
    protected array $settings = [];

    /**
     * Inject the settings
     *
     * @param array $settings The settings array handed over by the framework
     */
    public function injectSettings(array $settings): void
    {
        $this->settings = $settings;
    }

    /**
     * Crawl for invalid node links and external links
     *
     * Runs crawlNodesCommand first and crawlExternalLinksCommand afterwards.
     *
     * @throws \Neos\Flow\Security\Exception
     * @throws UnresolvedDependenciesException
     * @throws InvalidArgumentTypeException
     * @throws \Neos\Eel\Exception
     * @throws \Neos\Flow\Property\Exception
     * @throws InvalidArgumentNameException
     * @throws \Neos\Neos\Exception
     * @throws MissingActionNameException
     * @throws IllegalObjectTypeException
     * @throws InvalidActionNameException
     * @throws InvalidQueryException
     * @throws InvalidControllerNameException
     */
    public function crawlCommand(): void
    {
        $this->crawlNodesCommand();
        $this->crawlExternalLinksCommand();
    }

    /**
     * Crawl for invalid external links
     *
     * This command crawls the whole website for invalid external links
     */
    public function crawlExternalLinksCommand(): void
    {
        $crawlProfile = new CheckAllLinks();
        $crawlObserver = new LogBrokenLinks();

        $crawler = Crawler::create($this->getClientOptions())
            ->setConcurrency($this->getConcurrency())
            ->setCrawlObserver($crawlObserver)
            ->setCrawlProfile($crawlProfile);

        if ($this->shouldIgnoreRobots()) {
            $crawler->ignoreRobots();
        }

        $notificationsEnabled = $this->settings['notifications']['enabled'] ?? false;

        foreach ($this->getUrlsToCrawl() as $url) {
            try {
                $this->outputLine("Start scanning {$url}");
                $this->outputLine('');

                $crawler->startCrawling($url);

                // NOTE(review): the same observer instance is used for every URL, so the
                // results sent for a later URL presumably still contain those of earlier
                // URLs - confirm against LogBrokenLinks.
                if ($notificationsEnabled) {
                    $this->sendNotification($crawlObserver->getResultItemsGroupedByStatusCode());
                }
            } catch (\InvalidArgumentException $exception) {
                // Also covers the exception sendNotification() throws when no
                // notification service is configured; the remaining URLs are still crawled.
                $this->outputLine('ERROR: ' . $exception->getMessage());
            }
        }
    }

    /**
     * Returns the configured "urlsToCrawl" setting.
     *
     * A missing or non-array setting is treated as "nothing to crawl" instead of
     * triggering an undefined index warning or iterating over null.
     *
     * @return non-empty-string[]
     */
    protected function getUrlsToCrawl(): array
    {
        $urlsToCrawl = $this->settings['urlsToCrawl'] ?? [];

        return is_array($urlsToCrawl) ? $urlsToCrawl : [];
    }

    /**
     * Get client options for the guzzle client from the settings. If no settings are configured we just set
     * timeout and allow_redirect.
     *
     * Values of the "clientOptions" setting are only taken over when they have the
     * expected type: booleans for "cookies" and "allowRedirects", numeric values for
     * "connectionTimeout" and "timeout", and an array with at least two entries for "auth".
     *
     * @return array
     */
    protected function getClientOptions(): array
    {
        $clientOptions = [
            RequestOptions::TIMEOUT => 100,
            RequestOptions::ALLOW_REDIRECTS => false,
        ];

        $optionsSettings = $this->settings['clientOptions'] ?? [];

        $booleanOptions = [
            'cookies' => RequestOptions::COOKIES,
            'allowRedirects' => RequestOptions::ALLOW_REDIRECTS,
        ];
        foreach ($booleanOptions as $settingName => $requestOption) {
            if (isset($optionsSettings[$settingName]) && is_bool($optionsSettings[$settingName])) {
                $clientOptions[$requestOption] = $optionsSettings[$settingName];
            }
        }

        $numericOptions = [
            'connectionTimeout' => RequestOptions::CONNECT_TIMEOUT,
            'timeout' => RequestOptions::TIMEOUT,
        ];
        foreach ($numericOptions as $settingName => $requestOption) {
            if (isset($optionsSettings[$settingName]) && is_numeric($optionsSettings[$settingName])) {
                $clientOptions[$requestOption] = $optionsSettings[$settingName];
            }
        }

        if (
            isset($optionsSettings['auth']) && is_array($optionsSettings['auth'])
            && count($optionsSettings['auth']) > 1
        ) {
            $clientOptions[RequestOptions::AUTH] = $optionsSettings['auth'];
        }

        return $clientOptions;
    }

    /**
     * Returns the "concurrency" setting as an integer when it is set and not
     * negative; otherwise the default of 10.
     */
    protected function getConcurrency(): int
    {
        if (isset($this->settings['concurrency'])) {
            $concurrency = (int)$this->settings['concurrency'];
            if ($concurrency >= 0) {
                return $concurrency;
            }
        }

        return 10;
    }

    /**
     * Returns true by default and can be changed by the setting ignoreRobots
     */
    protected function shouldIgnoreRobots(): bool
    {
        return (bool)($this->settings['ignoreRobots'] ?? true);
    }

    /**
     * Send notification about the result of the link check run. The notification service can be configured.
     *
     * Only status codes greater than or equal to the "notifications.minimumStatusCode"
     * setting (default: self::MIN_STATUS_CODE) are handed to the service.
     *
     * @param array $results URLs grouped by status code (status code => list of URLs)
     * @throws \InvalidArgumentException if no notification service class is configured
     */
    protected function sendNotification(array $results): void
    {
        // The injected setting is null when it is not configured. Under strict_types,
        // trim(null) raises a TypeError, which would bypass the exception below.
        $notificationServiceClass = is_string($this->notificationServiceClass)
            ? trim($this->notificationServiceClass)
            : '';
        if ($notificationServiceClass === '') {
            $errorMessage = 'No notification service has been configured, but the notification handling is enabled';
            throw new \InvalidArgumentException($errorMessage, 1540201992);
        }

        $minimumStatusCode = (int)($this->settings['notifications']['minimumStatusCode'] ?? self::MIN_STATUS_CODE);
        $arguments = [];
        foreach ($results as $statusCode => $urls) {
            if ($statusCode < $minimumStatusCode) {
                continue;
            }

            $arguments['result'][$statusCode] = [
                'statusCode' => $statusCode,
                'urls' => $urls,
                'amount' => count($urls),
            ];
        }

        /** @var NotificationServiceInterface $notificationService */
        $notificationService = $this->objectManager->get($notificationServiceClass);
        $notificationService->sendNotification($this->settings['notifications']['subject'] ?? '', $arguments);
    }

    /**
     * Crawl for invalid links within nodes
     *
     * This command crawls an url for invalid internal and external links
     *
     * The domains to crawl are resolved from the "urlsToCrawl" setting. If none is
     * found, a translated error message is printed and nothing is crawled.
     *
     * @throws IllegalObjectTypeException
     * @throws InvalidActionNameException
     * @throws InvalidArgumentNameException
     * @throws InvalidArgumentTypeException
     * @throws InvalidControllerNameException
     * @throws InvalidQueryException
     * @throws MissingActionNameException
     * @throws UnresolvedDependenciesException
     * @throws \Neos\Eel\Exception
     * @throws \Neos\Flow\Property\Exception
     * @throws \Neos\Flow\Security\Exception
     * @throws \Neos\Neos\Exception
     */
    public function crawlNodesCommand(): void
    {
        $domainsToCrawl = $this->domainService->getDomainsToCrawl($this->getUrlsToCrawl());

        if (count($domainsToCrawl) === 0) {
            $message = $this->translator->translateById('noDomainsFound', [], null, null, 'Modules', 'CodeQ.LinkChecker');
            $this->output->outputFormatted('<error>' . $message . '</error>');

            return;
        }

        foreach ($domainsToCrawl as $domainToCrawl) {
            $this->crawlDomain($domainToCrawl);
        }
    }

    /**
     * Crawls the content nodes of one domain and prints every message returned
     * by the content node crawler as an error line.
     *
     * @param Domain $domain The domain whose site is used to create the content context
     *
     * @throws IllegalObjectTypeException
     * @throws InvalidActionNameException
     * @throws InvalidArgumentNameException
     * @throws InvalidArgumentTypeException
     * @throws InvalidControllerNameException
     * @throws MissingActionNameException
     * @throws UnresolvedDependenciesException
     * @throws \Neos\Eel\Exception
     * @throws \Neos\Flow\Property\Exception
     * @throws \Neos\Flow\Security\Exception
     * @throws \Neos\Neos\Exception
     */
    protected function crawlDomain(Domain $domain): void
    {
        $context = $this->contextFactory->create([
            'currentSite' => $domain->getSite(),
            'currentDomain' => $domain,
        ]);

        $messages = $this->contentNodeCrawler->crawl($context, $domain);

        foreach ($messages as $message) {
            $this->output->outputFormatted('<error>' . $message . '</error>');
        }
    }
}
0 commit comments