Skip to content

Commit

Permalink
[FEATURE] Add flushed and edited pages to crawler queue (#271)
Browse files Browse the repository at this point in the history
  • Loading branch information
tomasnorre authored Aug 1, 2020
1 parent 7a98c98 commit 0d88d0a
Show file tree
Hide file tree
Showing 12 changed files with 229 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
### Added
* Adds light red background color + icons to crawler log rows with errors
* Crawler processing by page priority
* Automatically add pages to the crawler queue when they are edited or their page cache is cleared

## Crawler 9.0.3
Crawler 9.0.2 was released on July 20th, 2020
Expand Down
7 changes: 6 additions & 1 deletion Classes/Api/CrawlerApi.php
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,12 @@ public function addPageToQueueTimed($uid, $time): void
$time = intval($time);

$crawler = $this->findCrawler();
$pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage($uid);
/**
* Todo: Switch back to getPage(); when dropping support for TYPO3 9 LTS - TNM
* This switch to getPage_noCheck() is needed as TYPO3 9 LTS doesn't return dokType < 200, therefore automatically
* adding pages to crawler queue when editing page-titles from the page tree directly was not working.
*/
$pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage_noCheck($uid, true);
$configurations = $crawler->getUrlsForPageRow($pageData);
$configurations = $this->filterUnallowedConfigurations($configurations);
$downloadUrls = [];
Expand Down
56 changes: 56 additions & 0 deletions Classes/Hooks/DataHandlerHook.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
<?php

declare(strict_types=1);

namespace AOE\Crawler\Hooks;

/*
* (c) 2020 AOE GmbH <[email protected]>
*
* This file is part of the TYPO3 Crawler Extension.
*
* It is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License, either version 2
* of the License, or any later version.
*
* For the full copyright and license information, please read the
* LICENSE.txt file that was distributed with this source code.
*
* The TYPO3 project - inspiring people to share!
*/

use AOE\Crawler\Api\CrawlerApi;
use AOE\Crawler\Domain\Repository\QueueRepository;
use TYPO3\CMS\Core\Utility\GeneralUtility;
use TYPO3\CMS\Extbase\Object\ObjectManager;

/**
 * DataHandler hook that puts pages into the crawler queue when their page cache gets flushed.
 */
class DataHandlerHook
{
    /**
     * Adds the pages listed in the hook parameters to the crawler queue.
     *
     * Page ids lower than 1 are ignored, and pages which the queue repository
     * reports as already queued are not added a second time.
     *
     * @param array $parameters Hook parameters; only the 'pageIdArray' entry is read
     * @param \TYPO3\CMS\Core\DataHandling\DataHandler $dataHandler The calling DataHandler (not used here)
     */
    public function addFlushedPagesToCrawlerQueue(array $parameters, \TYPO3\CMS\Core\DataHandling\DataHandler $dataHandler): void
    {
        // A missing or malformed 'pageIdArray' must not raise notices/warnings inside a core hook.
        $candidates = $parameters['pageIdArray'] ?? [];
        if (! is_array($candidates) || $candidates === []) {
            return;
        }

        // Normalise to positive integers; keying by id drops duplicates so each page is checked once.
        $pageIds = [];
        foreach ($candidates as $candidate) {
            $pageId = (int) $candidate;
            if ($pageId > 0) {
                $pageIds[$pageId] = $pageId;
            }
        }
        if ($pageIds === []) {
            return;
        }

        // Resolve the collaborators once instead of once per page.
        $queueRepository = $this->getQueueRepository();
        $crawlerApi = $this->getCrawlerApi();

        foreach ($pageIds as $pageId) {
            if ($queueRepository->isPageInQueue($pageId)) {
                continue;
            }
            $crawlerApi->addPageToQueue($pageId);
        }
    }

    /**
     * Fetches the queue repository through the Extbase object manager.
     */
    private function getQueueRepository(): QueueRepository
    {
        return GeneralUtility::makeInstance(ObjectManager::class)->get(QueueRepository::class);
    }

    /**
     * Fetches the crawler API through the Extbase object manager.
     */
    private function getCrawlerApi(): CrawlerApi
    {
        return GeneralUtility::makeInstance(ObjectManager::class)->get(CrawlerApi::class);
    }
}
12 changes: 12 additions & 0 deletions Classes/Utility/HookUtility.php
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,17 @@ public static function registerHooks($extKey): void
// Activating refresh hooks
$GLOBALS['TYPO3_CONF_VARS']['EXTCONF'][$extKey]['refresh_hooks'][] =
ProcessCleanUpHook::class;

// Env-dependent
if (TYPO3_MODE === 'BE') {
self::registerBackendHooks($extKey);
}
}

/**
 * Registers the hooks that registerHooks() only activates when TYPO3_MODE is 'BE'.
 *
 * @param string $extKey Extension key (not used by this method at the moment)
 */
private static function registerBackendHooks(string $extKey): void
{
    // Hook reference in "ClassName->methodName" notation for the DataHandler clear page cache pre-processing
    $flushedPagesHook = 'AOE\Crawler\Hooks\DataHandlerHook->addFlushedPagesToCrawlerQueue';

    $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_tcemain.php']['clearPageCacheEval'][] = $flushedPagesHook;
}
}
38 changes: 38 additions & 0 deletions Documentation/Features/AutomaticAddPagesToQueue/Index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
.. include:: /Includes.txt

================================
Automatically add pages to Queue
================================

Since 9.1.0

Edit Pages
----------

With this feature, pages are automatically added to the crawler queue
when you edit content on them. Pages edited within a workspace are not
added to the queue until they are published.

The advantage of this functionality is that you do not need to keep track
of which pages you have edited; they are handled automatically by the next crawler
process task, see :ref:`executing-the-queue-label`. This ensures that
your cache or e.g. search index is always up to date and that end users see
the most current content as soon as possible.

Clear Single Page Cache
-----------------------

As editing a page and clearing its page cache use the same DataHandler hooks,
we get an additional feature for free: when you clear the page cache for a specific
page, it is also added automatically to the crawler queue. Again, this will
be processed during the next crawler process.

.. figure:: /Images/backend_clear_cache.png
:alt: Clearing the page cache

Clearing the page cache

.. figure:: /Images/backend_clear_cache_queue.png
:alt: Page is added to the crawler queue

Page is added to the crawler queue
3 changes: 1 addition & 2 deletions Documentation/Features/Hooks/Index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,4 @@ Example::

return false;
}
}

}
1 change: 1 addition & 0 deletions Documentation/Features/Index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Features
:titlesonly:
:glob:

AutomaticAddPagesToQueue/Index
PollableProcessingInstructions/Index
MultiprocessSupport/Index
Hooks/Index
Expand Down
Binary file added Documentation/Images/backend_clear_cache.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
112 changes: 112 additions & 0 deletions Tests/Unit/Hooks/DataHandlerHookTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
<?php

declare(strict_types=1);

namespace AOE\Crawler\Tests\Unit\Hooks;

/*
* (c) 2020 AOE GmbH <[email protected]>
*
* This file is part of the TYPO3 Crawler Extension.
*
* It is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License, either version 2
* of the License, or any later version.
*
* For the full copyright and license information, please read the
* LICENSE.txt file that was distributed with this source code.
*
* The TYPO3 project - inspiring people to share!
*/

use AOE\Crawler\Api\CrawlerApi;
use AOE\Crawler\Domain\Repository\QueueRepository;
use AOE\Crawler\Hooks\DataHandlerHook;
use Nimut\TestingFramework\TestCase\UnitTestCase;
use Prophecy\Argument;
use TYPO3\CMS\Core\Cache\CacheManager;
use TYPO3\CMS\Core\Cache\Frontend\FrontendInterface;
use TYPO3\CMS\Core\DataHandling\DataHandler;
use TYPO3\CMS\Core\Utility\GeneralUtility;
use TYPO3\CMS\Extbase\Object\ObjectManager;

/**
 * Unit tests for the DataHandler hook that adds flushed pages to the crawler queue.
 */
class DataHandlerHookTest extends UnitTestCase
{
    /**
     * Singleton instances that were registered before the test started.
     *
     * @var array
     */
    private $singletonInstancesBackup = [];

    /**
     * Remembers the registered singletons, as the tests replace some of them with test doubles.
     */
    protected function setUp(): void
    {
        parent::setUp();
        $this->singletonInstancesBackup = GeneralUtility::getSingletonInstances();
    }

    /**
     * Restores the singletons so the prophecy doubles do not leak into other tests.
     */
    protected function tearDown(): void
    {
        GeneralUtility::resetSingletonInstances($this->singletonInstancesBackup);
        parent::tearDown();
    }

    /**
     * Page with ID 1 is not in queue, should be added
     * Page with ID 2 is already in queue. Should NOT be added.
     *
     * @test
     */
    public function itShouldAddPageToQueue(): void
    {
        $this->registerCollaborators([
            1 => false,
            2 => true,
        ]);

        (new DataHandlerHook())->addFlushedPagesToCrawlerQueue(
            [
                'table' => 'pages',
                'pageIdArray' => [0, 1, 2],
            ],
            new DataHandler()
        );
    }

    /**
     * Page with ID 1 is not in queue, should be added
     * Page with ID 2 is already in queue. Should NOT be added.
     * Page with ID 3 is not in queue, should be added
     *
     * @test
     */
    public function itShouldAddPageToQueueWithMorePages(): void
    {
        $this->registerCollaborators([
            1 => false,
            2 => true,
            3 => false,
        ]);

        (new DataHandlerHook())->addFlushedPagesToCrawlerQueue(
            [
                'table' => 'tt_content',
                'pageIdArray' => [0, 1, 2, 3],
            ],
            new DataHandler()
        );
    }

    /**
     * Registers test doubles for everything the hook (and the DataHandler constructor) resolves.
     *
     * For every page id the queue repository double answers isPageInQueue() with the given
     * state; the crawler API double expects addPageToQueue() for pages that are not queued yet
     * and forbids it for pages that already are.
     *
     * @param array $queueState Map of page id => whether the page is already in the queue
     */
    private function registerCollaborators(array $queueState): void
    {
        $crawlerApi = $this->prophesize(CrawlerApi::class);
        $queueRepository = $this->prophesize(QueueRepository::class);

        foreach ($queueState as $pageId => $alreadyQueued) {
            $queueRepository->isPageInQueue($pageId)->willReturn($alreadyQueued);

            if ($alreadyQueued) {
                $crawlerApi->addPageToQueue($pageId)->shouldNotBeCalled();
            } else {
                $crawlerApi->addPageToQueue($pageId)->shouldBeCalled();
            }
        }

        $objectManager = $this->prophesize(ObjectManager::class);
        $objectManager->get(QueueRepository::class)->willReturn($queueRepository->reveal());
        $objectManager->get(CrawlerApi::class)->willReturn($crawlerApi->reveal());

        $cacheManager = $this->prophesize(CacheManager::class);
        $cacheManager->getCache(Argument::any())->willReturn($this->prophesize(FrontendInterface::class)->reveal());

        GeneralUtility::setSingletonInstance(CacheManager::class, $cacheManager->reveal());
        GeneralUtility::setSingletonInstance(ObjectManager::class, $objectManager->reveal());
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

declare(strict_types=1);

namespace AOE\Crawler\Tests\Unit\Hook;
namespace AOE\Crawler\Tests\Unit\Hooks;

/*
* (c) 2020 AOE GmbH <[email protected]>
Expand Down
2 changes: 1 addition & 1 deletion ext_localconf.php
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<?php
defined('TYPO3_MODE') or die();

\AOE\Crawler\Utility\HookUtility::registerHooks($_EXTKEY);
\AOE\Crawler\Utility\HookUtility::registerHooks('crawler');
\AOE\Crawler\Utility\BackendUtility::registerIcons();

$GLOBALS['TYPO3_CONF_VARS']['BE']['ContextMenu']['ItemProviders'][1566472321] = \AOE\Crawler\ContextMenu\ItemProvider::class;

0 comments on commit 0d88d0a

Please sign in to comment.