diff --git a/lib/controllers/v1/projects_controller.js b/lib/controllers/v1/projects_controller.js index 45785124..cbbde601 100644 --- a/lib/controllers/v1/projects_controller.js +++ b/lib/controllers/v1/projects_controller.js @@ -10,6 +10,7 @@ const Project = require( "../../models/project" ); const Site = require( "../../models/site" ); const User = require( "../../models/user" ); const ObservationsController = require( "./observations_controller" ); +const ElasticQueryBuilder = require( "../../elastic_query_builder" ); const ProjectsController = class ProjectsController { static async searchCriteria( req, options = { } ) { @@ -176,48 +177,25 @@ const ProjectsController = class ProjectsController { static async autocomplete( req ) { InaturalistAPI.setPerPage( req, { default: 10, max: 300 } ); + const { page, perPage } = InaturalistAPI.paginationData( req ); const searchCriteria = await ProjectsController.searchCriteria( req, { autocomplete: true } ); if ( !searchCriteria ) { return InaturalistAPI.basicResponse( req ); } - const response = await esClient.search( "projects", { - body: { - query: { - function_score: { - query: { - bool: { - filter: searchCriteria.filters, - must_not: searchCriteria.inverse_filters, - should: [ - { - constant_score: { - filter: { - multi_match: { - query: ( req.query && req.query.q ) || "", - fields: ["*_autocomplete", "description"], - type: "phrase" - } - }, - boost: 1 - } - } - ] - } - }, - field_value_factor: { - field: "universal_search_rank", - factor: 1, - missing: 3, - modifier: "log2p" - }, - boost_mode: "sum" - } - }, - _source: Project.returnFields, - size: req.query.per_page, - sort: searchCriteria.sort - } + const { q } = req.query; + const body = ElasticQueryBuilder.buildQuery( { + q, + sources: ["projects"], + page, + perPage, + req, + filters: searchCriteria.filters, + inverseFilters: searchCriteria.inverse_filters, + sort: searchCriteria.sort, + source: Project.returnFields, + useFunctionScore: true } ); + const 
response = await esClient.search( "projects", { body } ); return ProjectsController.esResponseToAPIResponse( req, response ); } diff --git a/lib/controllers/v1/search_controller.js b/lib/controllers/v1/search_controller.js index a6b636e9..41b6b42e 100644 --- a/lib/controllers/v1/search_controller.js +++ b/lib/controllers/v1/search_controller.js @@ -7,6 +7,7 @@ const User = require( "../../models/user" ); const util = require( "../../util" ); const InaturalistAPI = require( "../../inaturalist_api" ); const TaxaController = require( "./taxa_controller" ); +const ElasticQueryBuilder = require( "../../elastic_query_builder" ); const SearchController = { }; @@ -25,22 +26,11 @@ SearchController.search = async req => { } } const q = req.query ? req.query.q : ""; - const isID = Number.isInteger( Number( q ) ) && Number( q ) > 0; + if ( _.isEmpty( q ) ) { return InaturalistAPI.basicResponse( req ); } const { page, perPage } = InaturalistAPI.paginationData( req ); - // Things that absolutely must be included - const filter = [ - // Sometimes cruft piles up. 
We don't want to return it - { exists: { field: "id" } } - ]; - if ( req.query && req.query.place_id ) { - filter.push( { - terms: { associated_place_ids: [req.query.place_id] } - } ); - } - // Things that absolutely must NOT be included const mustNot = [ { term: { @@ -58,166 +48,31 @@ SearchController.search = async req => { } } ]; - // The interesting stuff - const should = [ - // match _autocomplete fields across all indices - { - constant_score: { - filter: { - multi_match: { - query: q, - fields: ["*_autocomplete", "name"], - fuzziness: "AUTO", - prefix_length: 5, - max_expansions: 2, - operator: "and" - } - }, - boost: 1 - } - }, - // match the nested name_autocomplete field in the taxa index - { - constant_score: { - filter: { - nested: { - path: "names", - ignore_unmapped: true, - query: { - match: { - "names.name_autocomplete": { - fuzziness: "AUTO", - prefix_length: 5, - query: q, - operator: "and" - } - } - } - } - }, - boost: 2 - } - }, - // boost exact matches in the taxa index - { - constant_score: { - filter: { - nested: { - path: "names", - ignore_unmapped: true, - query: { - match: { - "names.exact_ci": { - query: q - } - } - } - } - }, - boost: 3 - } - }, - // boost exact matches in the users index - { - constant_score: { - filter: { - bool: { - should: [ - { - match: { - login_exact: { - query: q - } - } - }, - { - match: { - orcid: { - query: q - } - } - } - ] - } - }, - boost: 3 - } - }, - // boost exact matches across the rest of the indices Note: this - // isn't working perfectly. For one thing it matches more than - // exact matches, e.g. when you search for "lepidopt" you get a - // lot of projects about Lepidoptera. It also seems to score - // projects with multuple mentions of lepidoptera in the desc - // higher than the taxon Lepidoptera if you boost. Boost at 1 is - // ok, but a better solution would be to actually do exact - // matching and score docs equally regardless of term frequency. 
- { - constant_score: { - filter: { - multi_match: { - query: q, - fields: ["*_autocomplete", "description"], - type: "phrase" - } - }, - boost: 1 - } - } + const filters = [ + { exists: { field: "id" } } ]; - if ( isID ) { - should.push( { - constant_score: { - filter: { - term: { id: Number( q ) } - }, - boost: 3 - } + if ( req.query && req.query.place_id ) { + filters.push( { + terms: { associated_place_ids: [req.query.place_id] } } ); } - // Add the shoulds to the filter. Without this, the shoulds will operate only - // in the query context and won't filter out non-matching documents, e.g. if - // you search for "moth" you'll get back documents that do not contain the - // word moth, and if they get a higher score due to higher obs count, they can - // appear above more relevant matches - filter.push( { - bool: { - should - } - } ); - const body = { - from: ( perPage * page ) - perPage, - size: perPage, - query: { - function_score: { - query: { - bool: { - filter, - must_not: mustNot, - should - } - }, - field_value_factor: { - field: "universal_search_rank", - factor: 1, - missing: 3, - modifier: "log2p" - }, - boost_mode: "sum" + const body = ElasticQueryBuilder.buildQuery( { + q, + sources, + page, + perPage, + req, + mustNot, + filters, + useFunctionScore: true, + highlight: { + fields: { + "names.exact_ci": { }, + "*_autocomplete": { }, + description: { } } - }, - _source: { excludes: User.elasticExcludeFields } - }; - const highlight = { - fields: { - "names.exact_ci": { }, - "*_autocomplete": { }, - description: { } } - }; - if ( util.isJa( q ) ) { - highlight.fields["names.name_ja"] = { }; - } - body.highlight = highlight; + } ); const response = await esClient.search( sources, { body } ); const localeOpts = util.localeOpts( req ); const results = _.compact( _.map( response.hits.hits, h => { diff --git a/lib/controllers/v1/taxa_controller.js b/lib/controllers/v1/taxa_controller.js index bf2261ae..6e639dbf 100644 --- 
a/lib/controllers/v1/taxa_controller.js +++ b/lib/controllers/v1/taxa_controller.js @@ -10,6 +10,7 @@ const Place = require( "../../models/place" ); const Taxon = require( "../../models/taxon" ); const InaturalistAPI = require( "../../inaturalist_api" ); const RedisCacheClient = require( "../../redis_cache_client" ); +const ElasticQueryBuilder = require( "../../elastic_query_builder" ); const TaxaController = { }; @@ -30,140 +31,7 @@ TaxaController.show = async req => { } ); }; -TaxaController.exact = async req => { - const params = req.query; - const q = params.q || params.term; - if ( !q || q.length < 2 ) { return null; } - const inverseFilters = []; - const filters = [{ - nested: { - path: "names", - query: { - match: { - "names.exact_ci": { - query: q - } - } - } - } - }]; - const highlight = { - fields: { "names.exact_ci": { } }, - order: "score" - }; - if ( util.isJa( q ) ) { - filters.push( { - nested: { - path: "names", - query: { - multi_match: { - query: q, - fields: ["names.name_ja^10", "names.exact_ci"] - } - } - } - } ); - highlight.fields["names.name_ja"] = { }; - } - if ( params.taxon_id ) { - filters.push( esClient.termFilter( "ancestor_ids", params.taxon_id ) ); - } - if ( req.query.not_id ) { - inverseFilters.push( esClient.termFilter( "id", params.not_id ) ); - } - if ( params.rank ) { - filters.push( esClient.termFilter( "rank", params.rank ) ); - } - if ( params.rank_level ) { - filters.push( esClient.termFilter( "rank_level", params.rank_level ) ); - } - if ( req.inat.observedByUser ) { - const observedTaxonIDs = await ESModel.userObservedTaxonIDsCached( - req.inat.observedByUser.id - ); - if ( !_.isEmpty( observedTaxonIDs ) ) { - filters.push( esClient.termFilter( "id", observedTaxonIDs ) ); - } - } - if ( params.iconic ) { - filters.push( esClient.termFilter( "id", _.map( Taxon.iconicTaxaByID, "id" ) ) ); - } - let should = null; - const localeOpts = util.localeOpts( req ); - const preferredPlace = req.query.preferredPlace || 
localeOpts.preferredPlace; - if ( preferredPlace ) { - should = should || []; - const placeIDs = preferredPlace.ancestor_place_ids || [preferredPlace.id]; - should.push( { - // Within the should, though, we want a higher score if the taxon has a - // name in the locale AND that name matches the query, hence the must - nested: { - path: "names", - query: { - bool: { - must: [ - { - match: { - "names.name_autocomplete": { - query: q, - operator: "and" - } - } - }, - { - terms: { - "names.place_taxon_names.place_id": placeIDs - } - } - ] - } - } - } - } ); - } - let isActive = true; - if ( req.query.is_active === "false" ) { - isActive = false; - } else if ( req.query.is_active === "any" ) { - isActive = null; - } - if ( isActive !== null ) { - filters.push( esClient.termFilter( "is_active", isActive ) ); - } - req.query.page = 1; - req.query.per_page = 5; - req.elastic_query = { - query: { - function_score: { - query: { - bool: { - should, - must_not: inverseFilters, - filter: filters - } - }, - field_value_factor: { - field: "observations_count", - modifier: "log1p", - factor: 2 - }, - boost_mode: "sum" - } - }, - highlight, - sort: "_score" - }; - return TaxaController.searchQuery( req ); -}; - TaxaController.autocomplete = async req => { - const localeOpts = util.localeOpts( req ); - // not sending the actual req, rather making a copy - const exactResponse = await TaxaController.exact( _.cloneDeep( req ) ); - const exactResults = ( - exactResponse - && exactResponse.results - && exactResponse.results.length > 0 ) ? 
exactResponse.results : null; InaturalistAPI.setPerPage( req, { default: 10, max: 30 } ); const params = _.cloneDeep( req.query ); const q = params.q || params.term; @@ -179,32 +47,10 @@ TaxaController.autocomplete = async req => { } else if ( req.query.is_active === "any" ) { isActive = null; } - req.query.page = 1; + const { page, perPage } = InaturalistAPI.paginationData( req ); // Make sure we don't show things that don't match all query tokens const inverseFilters = []; - const options = {}; - const filters = [ - { - nested: { - path: "names", - query: { - match: { - "names.name_autocomplete": { - query: q, - operator: "and" - } - } - } - } - } - ]; - - const isID = Number.isInteger( Number( q ) ); - - if ( isID ) { - options.filters = []; - options.filters.push( { terms: { id: [q] } } ); - } + const filters = []; if ( isActive !== null ) { filters.push( esClient.termFilter( "is_active", isActive ) ); @@ -232,181 +78,11 @@ TaxaController.autocomplete = async req => { if ( req.query.iconic ) { filters.push( esClient.termFilter( "id", _.map( Taxon.iconicTaxaByID, "id" ) ) ); } + const highlight = { fields: { "names.name_autocomplete": { } }, order: "score" }; - const should = []; - // multi-token matches, e.g. 
if you search "foo bar" that should match "foo barness" - should.push( { - constant_score: { - filter: { - nested: { - path: "names", - query: { - match: { - "names.name_autocomplete": { - query: q, - operator: "and" - } - } - } - } - }, - boost: 1 - } - } ); - // Exact prefix matches - should.push( { - constant_score: { - filter: { - nested: { - path: "names", - query: { - prefix: { - "names.exact_ci": { - value: q - } - } - } - } - }, - boost: 1 - } - } ); - // extra boosting for exact prefixes of scientific names - should.push( { - constant_score: { - filter: { - nested: { - path: "names", - query: { - bool: { - must: [ - { - prefix: { - "names.exact_ci": { - value: q.toLowerCase( ) - } - } - }, - { - term: { - "names.locale": "sci" - } - } - ] - } - } - } - }, - boost: 2 - } - } ); - if ( localeOpts.locale ) { - const localeLang = localeOpts.locale.split( "-" )[0].toLowerCase( ); - // We want locale-specific matches to have a higher score, but we don't - // want to *only* show locale matches, hence the should - should.push( { - // Constant score allows us to boost name and locale matches higher than - // place-specific matches. 
Without this we end up with queries like "bi" - // matching names like "birds-foot trefoil" that have been added to a - // place higher than names like "birds" - constant_score: { - filter: { - // Within the should, though, we want a higher score if the taxon has a - // name in the locale AND that name matches the query, hence the must - nested: { - path: "names", - query: { - bool: { - must: [ - { - match: { - "names.name_autocomplete": { - query: q, - operator: "and" - } - } - }, - { - term: { - "names.locale": localeLang - } - } - ] - } - } - } - }, - boost: 1 - } - } ); - // ...and we need to add another boost for locale-specific prefix matches - should.push( { - constant_score: { - filter: { - nested: { - path: "names", - query: { - bool: { - must: [ - { - prefix: { - "names.exact_ci": { - value: q - } - } - }, - { - term: { - "names.locale": localeLang - } - } - ] - } - } - } - }, - boost: 2 - } - } ); - } - if ( localeOpts.preferredPlace ) { - const placeIDs = localeOpts.preferredPlace.ancestor_place_ids - || [localeOpts.preferredPlace.id]; - should.push( { - constant_score: { - filter: { - // Within the should, though, we want a higher score if the taxon has a - // name in the locale AND that name matches the query, hence the must - nested: { - path: "names", - query: { - bool: { - must: [ - { - match: { - "names.name_autocomplete": { - query: q, - operator: "and" - } - } - }, - { - terms: { - "names.place_taxon_names.place_id": placeIDs - } - } - ] - } - } - } - }, - boost: 1.5 - } - } ); - } if ( util.isJa( q ) ) { filters.push( { nested: { @@ -414,46 +90,34 @@ TaxaController.autocomplete = async req => { query: { multi_match: { query: q, - fields: ["names.name_autocomplete_ja^10", "names.name_autocomplete"] + fields: ["names.name_ja^10", "names.exact_ci"] } } } } ); - highlight.fields["names.name_autocomplete_ja"] = { }; + highlight.fields["names.exact_ci"] = { }; + highlight.fields["names.name_ja"] = { }; } - req.elastic_query = { - query: { - 
const _ = require( "lodash" );
const util = require( "./util" );
const User = require( "./models/user" );

const ElasticQueryBuilder = { };

// Wraps a query against the nested `names` documents in a constant_score
// clause so every matching document receives exactly `boost`.
// `ignore_unmapped` keeps the clause harmless on indices without `names`
const nestedNamesClause = ( query, boost ) => ( {
  constant_score: {
    filter: {
      nested: {
        path: "names",
        ignore_unmapped: true,
        query
      }
    },
    boost
  }
} );

// All query tokens must match the autocomplete analysis of a single name
const nameAutocompleteMatch = q => ( {
  match: {
    "names.name_autocomplete": {
      query: q,
      operator: "and"
    }
  }
} );

// Restricts a nested names query to names in the given locale
const nameLocaleTerm = locale => ( {
  term: {
    "names.locale": locale
  }
} );

// Builds the scoring clauses that only apply to the taxa index, in the order
// they are appended to the `should` list
const taxonNameClauses = ( q, languages, preferredPlace ) => {
  const clauses = [];
  // multi-token matches, e.g. if you search "foo bar" that should match "foo barness"
  clauses.push( nestedNamesClause( {
    match: {
      "names.name_autocomplete": {
        fuzziness: "AUTO",
        prefix_length: 5,
        query: q,
        operator: "and"
      }
    }
  }, 2 ) );
  if ( util.isJa( q ) ) {
    clauses.push( nestedNamesClause( {
      match: {
        "names.name_ja": {
          query: q,
          operator: "and"
        }
      }
    }, 2 ) );
  }
  // boost exact (case-insensitive) name matches. Note this is a match query,
  // not a prefix query
  clauses.push( nestedNamesClause( {
    match: {
      "names.exact_ci": {
        query: q
      }
    }
  }, 3 ) );
  // extra boosting for exact prefixes of scientific names
  clauses.push( nestedNamesClause( {
    bool: {
      must: [
        {
          prefix: {
            "names.exact_ci": {
              value: q.toLowerCase( )
            }
          }
        },
        nameLocaleTerm( "sci" )
      ]
    }
  }, 2 ) );
  languages.forEach( localeLang => {
    // Constant score allows us to boost name and locale matches higher than
    // place-specific matches. Without this we end up with queries like "bi"
    // matching names like "birds-foot trefoil" that have been added to a
    // place higher than names like "birds". Within the should, though, we
    // want a higher score if the taxon has a name in the locale AND that
    // name matches the query, hence the must
    clauses.push( nestedNamesClause( {
      bool: {
        must: [
          nameAutocompleteMatch( q ),
          nameLocaleTerm( localeLang )
        ]
      }
    }, 1 ) );
    // ...and we need to add another boost for locale-specific prefix matches
    clauses.push( nestedNamesClause( {
      bool: {
        must: [
          {
            prefix: {
              "names.exact_ci": {
                value: q
              }
            }
          },
          nameLocaleTerm( localeLang )
        ]
      }
    }, 2 ) );
  } );
  if ( preferredPlace ) {
    const placeIDs = preferredPlace.ancestor_place_ids || [preferredPlace.id];
    // higher score if the taxon has a name assigned to the preferred place
    // (or one of its ancestors) AND that name matches the query
    clauses.push( nestedNamesClause( {
      bool: {
        must: [
          nameAutocompleteMatch( q ),
          {
            terms: {
              "names.place_taxon_names.place_id": placeIDs
            }
          }
        ]
      }
    }, 1.5 ) );
  }
  return clauses;
};

// boost exact matches in the users index
const userExactClause = q => ( {
  constant_score: {
    filter: {
      bool: {
        should: [
          {
            match: {
              login_exact: {
                query: q
              }
            }
          },
          {
            match: {
              orcid: {
                query: q
              }
            }
          }
        ]
      }
    },
    boost: 3
  }
} );

/**
 * Builds an Elasticsearch search request body for free-text searches.
 *
 * None of the arrays or objects supplied by the caller are modified; the
 * builder works on copies.
 *
 * @param {Object} opts
 * @param {string} [opts.q] - search text. When empty no text clauses are added
 * @param {string[]} [opts.sources] - indices being searched. Taxa-specific
 *   clauses are added when this is missing or includes "taxa", user-specific
 *   clauses when it is missing or includes "users"
 * @param {number} opts.page - 1-based page number
 * @param {number} opts.perPage - page size
 * @param {Object} opts.req - the request; `req.userSession.taxonNamePriorities`
 *   and `util.localeOpts( req )` are read for language and place preferences
 * @param {Object[]} [opts.filters] - clauses every result must match
 * @param {Object[]} [opts.mustNot] - clauses no result may match
 * @param {Object[]} [opts.inverseFilters] - alias of `mustNot`; both lists are
 *   combined into `must_not`
 * @param {Object[]} [opts.should] - extra scoring clauses placed before the
 *   generated ones
 * @param {boolean} [opts.useFunctionScore] - wrap the query in a function_score
 * @param {Object} [opts.functionScoreFieldValueFactor] - field_value_factor to
 *   use instead of the default based on `universal_search_rank`
 * @param {*} [opts.sort] - copied to the body's `sort` when truthy
 * @param {*} [opts.source] - copied to `_source`; defaults to excluding
 *   `User.elasticExcludeFields`
 * @param {Object} [opts.highlight] - highlight definition; `names.name_ja` is
 *   added to its fields when `q` is Japanese according to `util.isJa`
 * @returns {Object} body to pass to the Elasticsearch search call
 */
ElasticQueryBuilder.buildQuery = opts => {
  const options = { ...opts };
  const {
    q,
    sources,
    page,
    perPage,
    req
  } = options;
  // copy so the clauses appended below never leak into caller-owned arrays
  const filter = [...( options.filters || [] )];
  // callers use both names for exclusions, honor either
  const mustNot = [
    ...( options.mustNot || [] ),
    ...( options.inverseFilters || [] )
  ];
  const should = [...( options.should || [] )];

  const userLanguages = [];
  if ( req.userSession && req.userSession.taxonNamePriorities ) {
    req.userSession.taxonNamePriorities.forEach( tnp => {
      if ( tnp.lexicon ) {
        userLanguages.push( tnp.lexicon );
      }
    } );
  }
  const localeOpts = util.localeOpts( req );
  if ( localeOpts.locale ) {
    userLanguages.push( localeOpts.locale.split( "-" )[0] );
  }
  const uniqueUserLanguages = _.uniq( userLanguages );

  if ( q ) {
    const isID = Number.isInteger( Number( q ) ) && Number( q ) > 0;
    // match _autocomplete fields across all indices
    should.push( {
      constant_score: {
        filter: {
          multi_match: {
            query: q,
            fields: ["*_autocomplete", "name"],
            fuzziness: "AUTO",
            prefix_length: 5,
            max_expansions: 2,
            operator: "and"
          }
        },
        boost: 1
      }
    } );
    // boost exact matches across the rest of the indices. Note: this
    // isn't working perfectly. For one thing it matches more than
    // exact matches, e.g. when you search for "lepidopt" you get a
    // lot of projects about Lepidoptera. It also seems to score
    // projects with multiple mentions of lepidoptera in the desc
    // higher than the taxon Lepidoptera if you boost. Boost at 1 is
    // ok, but a better solution would be to actually do exact
    // matching and score docs equally regardless of term frequency.
    should.push( {
      constant_score: {
        filter: {
          multi_match: {
            query: q,
            fields: ["*_autocomplete", "description"],
            type: "phrase"
          }
        },
        boost: 1
      }
    } );
    if ( isID ) {
      should.push( {
        constant_score: {
          filter: {
            term: { id: Number( q ) }
          },
          boost: 3
        }
      } );
    }
    if ( !sources || _.includes( sources, "taxa" ) ) {
      should.push(
        ...taxonNameClauses( q, uniqueUserLanguages, localeOpts.preferredPlace )
      );
    }
    if ( !sources || _.includes( sources, "users" ) ) {
      should.push( userExactClause( q ) );
    }
  }
  // Add the shoulds to the filter. Without this, the shoulds will operate only
  // in the query context and won't filter out non-matching documents, e.g. if
  // you search for "moth" you'll get back documents that do not contain the
  // word moth, and if they get a higher score due to higher obs count, they can
  // appear above more relevant matches
  filter.push( {
    bool: {
      should
    }
  } );

  const body = {
    from: ( perPage * page ) - perPage,
    size: perPage,
    query: {
      bool: {
        filter,
        must_not: mustNot,
        should
      }
    }
  };

  if ( options.useFunctionScore ) {
    const functionScore = {
      query: body.query,
      boost_mode: "sum"
    };
    if ( options.functionScoreFieldValueFactor ) {
      functionScore.field_value_factor = options.functionScoreFieldValueFactor;
    } else {
      functionScore.field_value_factor = {
        field: "universal_search_rank",
        factor: 1,
        missing: 3,
        modifier: "log2p"
      };
    }
    body.query = { function_score: functionScore };
  }
  if ( options.sort ) {
    body.sort = options.sort;
  }

  if ( options.source ) {
    body._source = options.source;
  } else {
    body._source = { excludes: User.elasticExcludeFields };
  }

  if ( options.highlight ) {
    // copy before adding fields so the caller's highlight object is untouched
    const highlight = {
      ...options.highlight,
      fields: { ...options.highlight.fields }
    };
    if ( util.isJa( q ) ) {
      highlight.fields["names.name_ja"] = { };
    }
    body.highlight = highlight;
  }
  return body;
};

module.exports = ElasticQueryBuilder;