From 1ba3b205002d64d3717c8c8201a708100035a17b Mon Sep 17 00:00:00 2001 From: Adam Reichold <adam.reichold@uba.de> Date: Wed, 5 Feb 2025 13:17:43 +0000 Subject: [PATCH] Include currentness in inherent score to prefer newer datasets. The values were determined using particle swarm optimization and NDCG loss against a set of manually ranked queries. --- src/index/indexer.rs | 65 +++++++++++++++++++++++++++++------------ src/index/time_range.rs | 6 ++-- src/lib.rs | 6 ++++ 3 files changed, 56 insertions(+), 21 deletions(-) diff --git a/src/index/indexer.rs b/src/index/indexer.rs index 27d823ee3a..3d408d12cb 100644 --- a/src/index/indexer.rs +++ b/src/index/indexer.rs @@ -10,12 +10,14 @@ use tantivy::{ store::Compressor, Index, IndexSettings, IndexWriter, }; +use time::{Date, OffsetDateTime}; use crate::{ dataset::{ quality::QualityLandingPage, r#type::Type, Alternative, Dataset, Organisation, Region, ResourceType, Status, Tag, }, + date_as_scalar, index::{ bounding_box::BoundingBoxes, index_reader, register_tokenizers, schema, time_range::TimeRanges, Fields, @@ -28,6 +30,7 @@ pub struct Indexer { index: Index, writer: IndexWriter, fields: Arc<Fields>, + today: Date, upgrade: Option<PathBuf>, } @@ -63,10 +66,13 @@ impl Indexer { let writer = index.writer(memory_budget * 1024 * 1024)?; writer.delete_all_documents()?; + let today = OffsetDateTime::now_utc().date(); + Ok(Self { index, writer, fields, + today, upgrade: upgrade.then_some(index_path), }) } @@ -81,6 +87,8 @@ impl Indexer { ) -> Result<()> { let mut doc = Document::default(); + self.add_inherent_score(&dataset, accesses, &mut doc); + doc.add_text(self.fields.title, dataset.title); if let Some(description) = dataset.description { @@ -221,22 +229,6 @@ impl Indexer { dataset.mandatory_registration, ); - const QUALITY_WEIGHT: f32 = 0.15; - const POPULARITY_WEIGHT: f32 = 0.05; - const BM25_WEIGHT: f32 = 1.0 - QUALITY_WEIGHT - POPULARITY_WEIGHT; - const { - assert!(BM25_WEIGHT > 0.0); - } - - let quality = 1.0 + 
dataset.quality.score; - let popularity = ((2 + accesses) as f32).log2(); - let inherent_score = quality.powf(QUALITY_WEIGHT / BM25_WEIGHT) - * popularity.powf(POPULARITY_WEIGHT / BM25_WEIGHT) - * dataset.quality.accessibility.landing_page.inherent_score() - * dataset.status.inherent_score(); - - doc.add_f64(self.fields.inherent_score, inherent_score as f64); - let eligible_for_random = matches!(dataset.status, Status::Active) && matches!( dataset.quality.accessibility.landing_page, @@ -266,8 +258,8 @@ impl Indexer { } for time_range in dataset.time_ranges { - let from = (time_range.from.year() << 9) | time_range.from.ordinal() as i32; - let until = (time_range.until.year() << 9) | time_range.until.ordinal() as i32; + let from = date_as_scalar(time_range.from); + let until = date_as_scalar(time_range.until); doc.add_i64(self.fields.time_range_from, from as i64); doc.add_i64(self.fields.time_range_until, until as i64); @@ -282,6 +274,43 @@ impl Indexer { Ok(()) } + fn add_inherent_score(&self, dataset: &Dataset, accesses: u64, doc: &mut Document) { + let quality = 1.0 + dataset.quality.score; + + let popularity = ((2 + accesses) as f32).log2(); + + let latest_date = dataset + .time_ranges + .iter() + .map(|time_range| time_range.until) + .chain(dataset.issued) + .chain(dataset.modified) + .max(); + + let currentness = latest_date.map_or(1.0, |date| { + let age = date_as_scalar(date.min(self.today)) - date_as_scalar(self.today); + + const RATE: f32 = 1.0 / 2173.913; // in one over date_as_scalar units (one per day within a year, 512 per year) + 1.0 + (RATE * age as f32).exp() // maps the range (-∞, 0] to (1, 2] + }); + + const QUALITY_WEIGHT: f32 = 0.028; + const POPULARITY_WEIGHT: f32 = 0.024; + const CURRENTNESS_WEIGHT: f32 = 0.089; + const BM25_WEIGHT: f32 = 1.0 - QUALITY_WEIGHT - POPULARITY_WEIGHT - CURRENTNESS_WEIGHT; + const { + assert!(BM25_WEIGHT > 0.0); + } + + let inherent_score = quality.powf(QUALITY_WEIGHT / BM25_WEIGHT) + * popularity.powf(POPULARITY_WEIGHT / BM25_WEIGHT) + * currentness.powf(CURRENTNESS_WEIGHT 
/ BM25_WEIGHT) + * dataset.quality.accessibility.landing_page.inherent_score() + * dataset.status.inherent_score(); + + doc.add_f64(self.fields.inherent_score, inherent_score as f64); + } + pub fn commit(mut self) -> Result<()> { self.writer.commit()?; diff --git a/src/index/time_range.rs b/src/index/time_range.rs index 9fb0147b3b..f44660a809 100644 --- a/src/index/time_range.rs +++ b/src/index/time_range.rs @@ -28,7 +28,7 @@ use tantivy::{ }; use tantivy_common::BitSet; -use crate::{dataset::TimeRange, index::scorer::Scores, minmax}; +use crate::{dataset::TimeRange, date_as_scalar, index::scorer::Scores, minmax}; type SegmentKey = (SegmentId, Option<Opstamp>); @@ -196,8 +196,8 @@ impl<const SCORING: bool> Weight for TimeRangeWeight<SCORING> { } fn as_interval(TimeRange { from, until }: TimeRange) -> Range<i32> { - let from = (from.year() << 9) | from.ordinal() as i32; - let until = (until.year() << 9) | until.ordinal() as i32; + let from = date_as_scalar(from); + let until = date_as_scalar(until); from..(until + 1) } diff --git a/src/lib.rs b/src/lib.rs index 507a9831a8..90087a919a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -32,6 +32,7 @@ use serde::{ ser::{Serialize, Serializer}, }; use tantivy::{schema::Facet, Score}; +use time::Date; pub fn data_path_from_env() -> PathBuf { var_os("DATA_PATH") @@ -302,6 +303,11 @@ pub fn get_two_mut<T>(slice: &mut [T], idx1: usize, idx2: usize) -> (&mut T, &mu (val1, val2) } +#[inline] +pub fn date_as_scalar(date: Date) -> i32 { + (date.year() << 9) | date.ordinal() as i32 +} + #[cfg(test)] mod tests { use super::*; -- GitLab