Skip to content

Commit

Permalink
Merger::update() implemented and multiple other fixes
Browse files Browse the repository at this point in the history
* NamedEntityRepo tests added (and small fixes to the code)
* initialization of local ARCHE instance for testing added to the CI/CD
* Merger::merge() fixes
* Crawler::crawl() fixes
  • Loading branch information
zozlak committed Nov 27, 2024
1 parent ca145e1 commit e30aac7
Show file tree
Hide file tree
Showing 18 changed files with 1,050 additions and 89 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/30-fixVocabularies.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
# Feeds a single SQL statement to psql on its standard input.
# The statement removes rows of the "metadata" table whose property is the
# ACDH schema "vocabs" property and whose value matches the regular
# expression iso|oefos.
psql <<'SQL'
DELETE FROM metadata WHERE property = 'https://vocabs.acdh.oeaw.ac.at/schema#vocabs' AND value ~ 'iso|oefos';
SQL
5 changes: 5 additions & 0 deletions .github/workflows/99-postgresqlConfig.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Adjusts the PostgreSQL configuration files under /home/www-data/postgresql
# so that the database accepts network connections without a password.
ARCHE_PG_CONF=/home/www-data/postgresql/postgresql.conf
ARCHE_PG_HBA=/home/www-data/postgresql/pg_hba.conf

# listen on every network interface
printf '%s\n' "listen_addresses = '*'" >> "$ARCHE_PG_CONF"

# replace every occurrence of peer, ident and md5 with trust
sed -i -E 's/peer|ident|md5/trust/g' "$ARCHE_PG_HBA"

# append trust rules for TCP connections
# NOTE(review): the /0 prefix length looks like it matches any client address,
# not only the loopback one - confirm this is intended outside of CI
cat >> "$ARCHE_PG_HBA" <<'EOF'
host all all 127.0.0.1/0 trust
host all all ::1/0 trust
EOF
11 changes: 11 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,17 @@ jobs:
- name: phpstan
run: |
vendor/bin/phpstan analyze -l 6 src tests bin/*
- name: clone repo config
run: |
git clone --depth 1 --branch arche https://github.com/acdh-oeaw/arche-docker-config.git config
chmod -x config/run.d/* config/initScripts/30-dissServices.php
chmod +x config/run.d/98-xdebug.sh config/run.d/10-postgresql.sh config/run.d/15-config-yaml.sh
cp .github/workflows/99-postgresqlConfig.sh config/run.d/
cp .github/workflows/30-fixVocabularies.sh config/initScripts/
- name: run repo docker
run: |
mkdir log
docker run --name arche -p 80:80 -p 5432:5432 -v `pwd`/log:/home/www-data/log -v `pwd`/config:/home/www-data/config -e USER_UID=`id -u` -e USER_GID=`id -g` -e ADMIN_PSWD=admin -d acdhch/arche
- name: phpunit
run: |
XDEBUG_MODE=coverage vendor/bin/phpunit
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@
/*.ttl*
/.phpunit.cache
/build
/config
/log
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
},
"require-dev": {
"phpunit/phpunit": "^10",
"phpstan/phpstan": "^1"
"phpstan/phpstan": "^1",
"acdh-oeaw/arche-lib-ingest": "^4.0"
},
"autoload-dev": {
"psr-4": {
Expand Down
94 changes: 11 additions & 83 deletions src/acdhOeaw/arche/refSources/Crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -61,33 +61,15 @@ public function __construct(object $refSrcsCfg, Schema $schema,
*
* @return Generator<ProcessEntityResult>
*/
public function crawl(NamedEntityIteratorInterface $source,
string $dateFilter = '',
string | null $idFilter = null,
int | null $limit = null): Generator {
$processed = [];
foreach ($this->mappings->getDbNames() as $extDbName) {
$this->log?->info("### Processing resources from $extDbName");

$idFilterTmp = $idFilter ?? $this->mappings->getRule($extDbName)->match;
$source->setFilter(null, $idFilterTmp, $dateFilter, $limit);

$N = 1;
foreach ($source->getNamedEntities() as $entity) {
$NT = $source->count();
$NN = round(100 * $N / $NT);
$this->log?->info(" " . $entity->getUri() . " ($N/$NT $NN%)");
$N++;

$entityUri = $entity->getUri();
if (isset($processed[$entityUri])) {
$this->log?->info(" already processed");
continue;
}
$processed[$entity->getUri()] = 1;

yield $this->processEntity($entity);
}
public function crawl(NamedEntityIteratorInterface $source): Generator {
    // Processes every named entity provided by $source, logging the progress,
    // and yields one ProcessEntityResult per entity.
    $this->log?->info("### Crawling source entities");
    $NT = $source->count();
    $N  = 1;
    foreach ($source->getNamedEntities() as $entity) {
        if ($NT === 0) {
            // A source may report 0 until it knows the total (presumably only
            // once the iteration has started), so ask again while it is unknown.
            $NT = $source->count();
        }
        // guard against a division by zero when the total is still unknown
        $NN = $NT > 0 ? round(100 * $N / $NT) : 0;
        $this->log?->info($entity->getUri() . " ($N/$NT $NN%)");
        $N++;
        yield $this->processEntity($entity);
    }
}

Expand Down Expand Up @@ -117,7 +99,7 @@ private function processEntity(NamedEntityInterface $entity): ProcessEntityResul
}
}
} catch (RefSourcesException | UriNormalizerException $e) {
$this->log?->debug(" unsupported source: " . $e->getMessage());
$this->log?->debug(" unsupported source: " . $e->getMessage());
}
}
$dbNames = $this->mappings->getDbNames();
Expand All @@ -136,60 +118,6 @@ private function processEntity(NamedEntityInterface $entity): ProcessEntityResul
}
}
// return merged and original metadata
return new ProcessEntityResult($entity, $entityMeta, $entityMetaOrig);
return new ProcessEntityResult($entityMeta, $entityMetaOrig);
}
// private function updateMetadata(Repo $repo, DatasetNodeInterface $meta,
// bool $test = true): array {
// if ($repo->inTransaction()) {
// $repo->rollback();
// }
// $schema = $repo->getSchema();
//
// // merge all matching resources
// $merged = [];
// $ids = $meta->listObjects(new PT($schema->id))->getValues();
// $st = new SearchTerm($schema->id, $ids, '=');
// $sc = new SearchConfig();
// $sc->metadataMode = RepoResource::META_RESOURCE;
// $sc->resourceProperties = [$schema->creationDate];
// try {
// $mainUri = $meta->getNode()->getValue();
// $repoResources = $repo->getResourcesBySearchTerms([$st], $sc);
// $repoResources = iterator_to_array($repoResources);
// // check if $meta's node makes sense and substitute if not
// $inRepo = array_sum(array_map(fn($x) => $x->getUri() === $mainUri, $repoResources));
// if ($inRepo === 0) {
// $mainUri = $repoResources[0]->getUri();
// }
// $mainRes = array_filter($repoResources, fn($x) => $x->getUri() === $mainUri);
// /* @var RepoResource $mainRes */
// $mainRes = reset($mainRes);
//
// // merge all matching repo resources with the main one
// $repo->begin();
// foreach ($repoResources as $repoResource) {
// /* @var RepoResource $repoResource */
// if ($repoResource->getUri() === $mainUri) {
// continue;
// }
// $merged[] = $repoResource->getUri();
// $repoResource->merge($mainUri, RepoResource::META_NONE);
// }
//
// // update the main resource
// $mainRes->setMetadata($meta);
// $mainRes->updateMetadata(RepoResource::UPDATE_MERGE);
// } catch (NotFound) {
// $repo->begin();
// $repo->createResource($meta);
// }
// if ($test) {
// $repo->rollback();
// } else {
// $repo->commit();
// }
//
// // return merge results
// return $merged;
// }
}
182 changes: 182 additions & 0 deletions src/acdhOeaw/arche/refSources/Merger.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
<?php

/*
* The MIT License
*
* Copyright 2024 zozlak.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

namespace acdhOeaw\arche\refSources;

use Psr\Log\LoggerInterface;
use GuzzleHttp\Exception\ClientException;
use rdfInterface\DatasetInterface;
use rdfInterface\QuadInterface;
use quickRdf\RdfNamespace;
use quickRdf\DatasetNode;
use quickRdfIo\Util as RdfIoUtil;
use acdhOeaw\arche\lib\Schema;
use acdhOeaw\arche\lib\SearchTerm;
use acdhOeaw\arche\lib\SearchConfig;
use acdhOeaw\arche\lib\Repo;
use acdhOeaw\arche\lib\RepoResource;
use acdhOeaw\arche\lib\exception\RepoLibException;
use termTemplates\QuadTemplate as QT;
use termTemplates\PredicateTemplate as PT;

;

/**
* Description of Merger
*
* @author zozlak
*/
/**
 * Merges RDF descriptions of entities sharing identifiers.
 *
 * Two modes are provided:
 * - merge() works only on a local dataset,
 * - update() pushes the data into a repository (requires the object to be
 *   created with a Repo instance).
 *
 * @author zozlak
 */
class Merger {

    private Repo $repo;
    private Schema $schema;
    private LoggerInterface | null $log;

    /**
     * @param Repo|Schema|null $repoOrSchema a repository connection (its schema
     *   is used and update() becomes available) or just a schema (only merge()
     *   can be used). Null is accepted by the signature but always rejected
     *   with a TypeError as the schema is mandatory.
     * @param LoggerInterface|null $log optional logger
     * @throws \TypeError when $repoOrSchema is null
     */
    public function __construct(Repo | Schema | null $repoOrSchema,
                                LoggerInterface | null $log = null) {
        if ($repoOrSchema === null) {
            // Assigning null to the typed $schema property raised a TypeError
            // anyway - keep the error type but make the message meaningful.
            throw new \TypeError('Merger requires a Repo or a Schema object, null given');
        }
        if ($repoOrSchema instanceof Repo) {
            $this->repo   = $repoOrSchema;
            $this->schema = $repoOrSchema->getSchema();
        } else {
            $this->schema = $repoOrSchema;
        }
        $this->log = $log;
    }

    /**
     * Merges subjects sharing the same identifiers using only the local data.
     *
     * The $data dataset is modified in place: for every identifier shared by
     * more than one subject all but one subject are removed and their
     * triples are folded into the remaining one (see mergeInto()).
     */
    public function merge(DatasetInterface $data): void {
        $this->log?->info("### Merging entities locally");
        // Take a snapshot of identifiers first as $data is modified within the loop.
        // Keys are dropped on purpose - only the values matter.
        $ids = iterator_to_array($data->listObjects(new PT($this->schema->id)), false);
        foreach ($ids as $id) {
            $tmpl = new PT($this->schema->id, $id);
            $sbjs = iterator_to_array($data->listSubjects($tmpl), false);
            if (count($sbjs) > 1) {
                // sorting is only for stable results but completely arbitrary
                // we have no way to check from which source the data of a subject with a given id come
                // so we cannot tell how to order subjects according to their importance
                usort($sbjs, static fn($a, $b) => $a->getValue() <=> $b->getValue());
                $mainSbj     = array_shift($sbjs);
                $mainSbjMeta = $data->copy(new QT($mainSbj));
                foreach ($sbjs as $sbj) {
                    $this->log?->info("Merging $sbj into $mainSbj");
                    // delete() returns the removed quads which are then merged into the main subject
                    $this->mergeInto($mainSbjMeta, $data->delete(new QT($sbj)));
                }
                // replace the main subject's triples with the merged ones
                $data->delete(new QT($mainSbj));
                $data->add($mainSbjMeta);
            }
        }
    }

    /**
     * Performs an update against the repository.
     *
     * For every subject of $data having at least one identifier the repository
     * is searched for resources with any of these identifiers and then:
     * - if none is found, a new resource is created,
     * - if exactly one is found, its metadata are updated,
     * - if more are found, the first one (ordered by URI) is updated with the
     *   non-identifier metadata and all others are merged into it.
     *
     * Failures of a single subject (RepoLibException, ClientException) are
     * logged and do not stop the processing of remaining subjects.
     *
     * @param DatasetInterface $data data to be ingested
     * @param bool $test when true every transaction is rolled back instead of
     *   being committed
     * @param resource|null $output when not null, the resulting metadata of
     *   every processed resource are serialized into it as text/turtle
     * @throws \RuntimeException when the object was created without a Repo
     */
    public function update(DatasetInterface $data, bool $test = true,
                           $output = null): void {
        if (!isset($this->repo)) {
            throw new \RuntimeException("No repository defined");
        }

        $nmsp = new RdfNamespace();
        foreach ($this->schema->namespaces ?? [] as $alias => $prefix) {
            $nmsp->add($prefix, $alias);
        }

        $this->log?->info("### Updating entities");
        $sc               = new SearchConfig();
        $sc->metadataMode = RepoResource::META_IDS;
        // the search term is reused - only its value is changed for every subject
        $st               = new SearchTerm($this->schema->id, [], '=');
        $idTmpl           = new PT($this->schema->id);

        foreach ($data->listSubjects() as $sbj) {
            $sbjMeta = new DatasetNode($sbj, $data->getIterator(new QT($sbj)));
            if ($sbjMeta->none($idTmpl)) {
                $this->log?->info("Skipping $sbj - no identifiers");
                continue;
            }
            $st->value     = $sbjMeta->listObjects($idTmpl)->getValues();
            $repoResources = $this->repo->getResourcesBySearchTerms([$st], $sc);
            // drop the generator keys so colliding keys can't silently hide a resource
            $repoResources = iterator_to_array($repoResources, false);

            try {
                $this->repo->begin();
                if (count($repoResources) === 0) {
                    $this->log?->info("Creating resource $sbj");
                    $mainRes = $this->repo->createResource($sbjMeta);
                } elseif (count($repoResources) === 1) {
                    /** @var RepoResource $mainRes */
                    $mainRes = reset($repoResources);
                    $this->log?->info("Updating metadata of " . $mainRes->getUri() . " with $sbj metadata");
                    $mainRes->setMetadata($sbjMeta);
                    $mainRes->updateMetadata(readMode: RepoResource::META_RESOURCE);
                } else {
                    // just for stable results
                    usort($repoResources, static fn($a, $b) => $a->getUri() <=> $b->getUri());
                    $mainRes = array_shift($repoResources);
                    $this->log?->info("Updating metadata of " . $mainRes->getUri() . " with $sbj metadata");
                    // skip identifiers not to cause conflicts
                    $mainRes->setMetadata($sbjMeta->copyExcept(new PT($this->schema->id)));
                    $mainRes->updateMetadata();
                    foreach ($repoResources as $resToMerge) {
                        /** @var RepoResource $resToMerge */
                        $this->log?->info(" Merging " . $resToMerge->getUri() . " into " . $mainRes->getUri());
                        $resToMerge->merge($mainRes->getUri(), readMode: RepoResource::META_RESOURCE);
                    }
                    // refresh the main resource metadata after the merges
                    $mainRes->loadMetadata(true, RepoResource::META_RESOURCE);
                }

                if ($test) {
                    $this->repo->rollback();
                } else {
                    $this->repo->commit();
                }

                if ($output !== null) {
                    RdfIoUtil::serialize($mainRes->getGraph(), 'text/turtle', $output, $nmsp);
                }
            } catch (RepoLibException | ClientException $e) {
                // Log only the exception class and message. Dumping the whole
                // exception object (e.g. with print_r()) would also write the
                // attached HTTP request and response objects into the log.
                $this->log?->error("Failed to update $sbj with: " . $e::class . ': ' . $e->getMessage());
            } finally {
                // make sure no transaction is left open, whatever happened
                if ($this->repo->inTransaction()) {
                    $this->repo->rollback();
                }
            }
        }
    }

    /**
     * Copies triples of $from into $into rewriting their subject to the
     * subject of $into.
     *
     * Only values of predicates which do not exist in $into are copied.
     * The identifier property is an exception - its values are always copied.
     */
    private function mergeInto(DatasetInterface $into, DatasetInterface $from): void {
        $intoSbj = $into->getSubject();
        $this->log?->debug("Merging " . $from->getSubject() . " into " . $intoSbj);
        $rebase  = static fn(QuadInterface $q) => $q->withSubject($intoSbj);
        foreach ($from->listPredicates() as $pred) {
            $tmpl = new PT($pred);
            if ($into->none($tmpl)) {
                $into->add($from->map($rebase, $tmpl));
            }
        }
        $into->add($from->map($rebase, new PT($this->schema->id)));
    }
}
4 changes: 2 additions & 2 deletions src/acdhOeaw/arche/refSources/NamedEntityIteratorFile.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
use termTemplates\QuadTemplate as QT;
use termTemplates\PredicateTemplate as PT;
use termTemplates\ValueTemplate as VT;
use quickRdfIo\Util as ioUtil;
use quickRdfIo\Util as RdfIoUtil;
use zozlak\RdfConstants as RDF;
use acdhOeaw\arche\lib\Schema;

Expand Down Expand Up @@ -67,7 +67,7 @@ class NamedEntityIteratorFile implements NamedEntityIteratorInterface {
public function __construct(mixed $rdfFilePath, Schema $schema,
string | null $format = null) {
$this->graph = new Dataset();
$this->graph->add(ioUtil::parse($rdfFilePath, new DataFactory(), $format));
$this->graph->add(RdfIoUtil::parse($rdfFilePath, new DataFactory(), $format));
$this->schema = $schema;
}

Expand Down
9 changes: 8 additions & 1 deletion src/acdhOeaw/arche/refSources/NamedEntityIteratorRepo.php
Original file line number Diff line number Diff line change
Expand Up @@ -68,19 +68,26 @@ public function setFilter(?string $class = null, ?string $idMatch = null,
if (!empty($minModDate)) {
$this->searchTerms[] = new SearchTerm($this->schema->modificationDate, $minModDate, '>=', SearchTerm::TYPE_DATETIME);
}
if (count($this->searchTerms) === 0) {
throw new RefSourcesException('At least one filter has to be defined');
}

$this->searchConfig = new SearchConfig();
$this->searchConfig->limit = $limit;
$this->searchConfig->metadataMode = RepoResourceInterface::META_RESOURCE;
}

/**
*
* @return \Generator<NamedEntityRepo>
*/
public function getNamedEntities(): \Generator {
    // run the search using the terms and the config stored on this object
    $found = $this->repo->getResourcesBySearchTerms($this->searchTerms, $this->searchConfig);
    foreach ($found as $repoResource) {
        // wrap every repository resource into a named entity object
        yield new NamedEntityRepo($repoResource);
    }
}

public function count(): int {
return $this->searchConfig->count ?? 0;
return min($this->searchConfig->count ?? 0, $this->searchConfig->limit ?? PHP_INT_MAX);
}
}
3 changes: 1 addition & 2 deletions src/acdhOeaw/arche/refSources/ProcessEntityResult.php
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,7 @@
*/
class ProcessEntityResult {

function __construct(public NamedEntityInterface $entity,
public DatasetNodeInterface $newData,
function __construct(public DatasetNodeInterface $newData,
public DatasetInterface $oldData) {

}
Expand Down
Loading

0 comments on commit e30aac7

Please sign in to comment.