From 1ba3b205002d64d3717c8c8201a708100035a17b Mon Sep 17 00:00:00 2001
From: Adam Reichold <adam.reichold@uba.de>
Date: Wed, 5 Feb 2025 13:17:43 +0000
Subject: [PATCH] Include currentness in inherent score to prefer newer
 datasets.

The values were determined using particle swarm optimization and
NDCG loss against a set of manually ranked queries.
---
 src/index/indexer.rs    | 65 +++++++++++++++++++++++++++++------------
 src/index/time_range.rs |  6 ++--
 src/lib.rs              |  6 ++++
 3 files changed, 56 insertions(+), 21 deletions(-)

diff --git a/src/index/indexer.rs b/src/index/indexer.rs
index 27d823ee3a..3d408d12cb 100644
--- a/src/index/indexer.rs
+++ b/src/index/indexer.rs
@@ -10,12 +10,14 @@ use tantivy::{
     store::Compressor,
     Index, IndexSettings, IndexWriter,
 };
+use time::{Date, OffsetDateTime};
 
 use crate::{
     dataset::{
         quality::QualityLandingPage, r#type::Type, Alternative, Dataset, Organisation, Region,
         ResourceType, Status, Tag,
     },
+    date_as_scalar,
     index::{
         bounding_box::BoundingBoxes, index_reader, register_tokenizers, schema,
         time_range::TimeRanges, Fields,
@@ -28,6 +30,7 @@ pub struct Indexer {
     index: Index,
     writer: IndexWriter,
     fields: Arc<Fields>,
+    today: Date,
     upgrade: Option<PathBuf>,
 }
 
@@ -63,10 +66,13 @@ impl Indexer {
         let writer = index.writer(memory_budget * 1024 * 1024)?;
         writer.delete_all_documents()?;
 
+        let today = OffsetDateTime::now_utc().date();
+
         Ok(Self {
             index,
             writer,
             fields,
+            today,
             upgrade: upgrade.then_some(index_path),
         })
     }
@@ -81,6 +87,8 @@ impl Indexer {
     ) -> Result<()> {
         let mut doc = Document::default();
 
+        self.add_inherent_score(&dataset, accesses, &mut doc);
+
         doc.add_text(self.fields.title, dataset.title);
 
         if let Some(description) = dataset.description {
@@ -221,22 +229,6 @@ impl Indexer {
             dataset.mandatory_registration,
         );
 
-        const QUALITY_WEIGHT: f32 = 0.15;
-        const POPULARITY_WEIGHT: f32 = 0.05;
-        const BM25_WEIGHT: f32 = 1.0 - QUALITY_WEIGHT - POPULARITY_WEIGHT;
-        const {
-            assert!(BM25_WEIGHT > 0.0);
-        }
-
-        let quality = 1.0 + dataset.quality.score;
-        let popularity = ((2 + accesses) as f32).log2();
-        let inherent_score = quality.powf(QUALITY_WEIGHT / BM25_WEIGHT)
-            * popularity.powf(POPULARITY_WEIGHT / BM25_WEIGHT)
-            * dataset.quality.accessibility.landing_page.inherent_score()
-            * dataset.status.inherent_score();
-
-        doc.add_f64(self.fields.inherent_score, inherent_score as f64);
-
         let eligible_for_random = matches!(dataset.status, Status::Active)
             && matches!(
                 dataset.quality.accessibility.landing_page,
@@ -266,8 +258,8 @@ impl Indexer {
         }
 
         for time_range in dataset.time_ranges {
-            let from = (time_range.from.year() << 9) | time_range.from.ordinal() as i32;
-            let until = (time_range.until.year() << 9) | time_range.until.ordinal() as i32;
+            let from = date_as_scalar(time_range.from);
+            let until = date_as_scalar(time_range.until);
 
             doc.add_i64(self.fields.time_range_from, from as i64);
             doc.add_i64(self.fields.time_range_until, until as i64);
@@ -282,6 +274,43 @@ impl Indexer {
         Ok(())
     }
 
+    fn add_inherent_score(&self, dataset: &Dataset, accesses: u64, doc: &mut Document) {
+        let quality = 1.0 + dataset.quality.score;
+
+        let popularity = ((2 + accesses) as f32).log2();
+
+        let latest_date = dataset
+            .time_ranges
+            .iter()
+            .map(|time_range| time_range.until)
+            .chain(dataset.issued)
+            .chain(dataset.modified)
+            .max();
+
+        let currentness = latest_date.map_or(1.0, |date| { // undated datasets get 1.0
+            let age = date_as_scalar(date.min(self.today)) - date_as_scalar(self.today);
+
+            const RATE: f32 = 1.0 / 2173.913; // per scalar unit: 1 per day, but 512 per year
+            1.0 + (RATE * age as f32).exp() // maps the range (-∞, 0] to (1, 2]
+        });
+
+        const QUALITY_WEIGHT: f32 = 0.028;
+        const POPULARITY_WEIGHT: f32 = 0.024;
+        const CURRENTNESS_WEIGHT: f32 = 0.089;
+        const BM25_WEIGHT: f32 = 1.0 - QUALITY_WEIGHT - POPULARITY_WEIGHT - CURRENTNESS_WEIGHT;
+        const {
+            assert!(BM25_WEIGHT > 0.0);
+        }
+
+        let inherent_score = quality.powf(QUALITY_WEIGHT / BM25_WEIGHT)
+            * popularity.powf(POPULARITY_WEIGHT / BM25_WEIGHT)
+            * currentness.powf(CURRENTNESS_WEIGHT / BM25_WEIGHT)
+            * dataset.quality.accessibility.landing_page.inherent_score()
+            * dataset.status.inherent_score();
+
+        doc.add_f64(self.fields.inherent_score, inherent_score as f64);
+    }
+
     pub fn commit(mut self) -> Result<()> {
         self.writer.commit()?;
 
diff --git a/src/index/time_range.rs b/src/index/time_range.rs
index 9fb0147b3b..f44660a809 100644
--- a/src/index/time_range.rs
+++ b/src/index/time_range.rs
@@ -28,7 +28,7 @@ use tantivy::{
 };
 use tantivy_common::BitSet;
 
-use crate::{dataset::TimeRange, index::scorer::Scores, minmax};
+use crate::{dataset::TimeRange, date_as_scalar, index::scorer::Scores, minmax};
 
 type SegmentKey = (SegmentId, Option<Opstamp>);
 
@@ -196,8 +196,8 @@ impl<const SCORING: bool> Weight for TimeRangeWeight<SCORING> {
 }
 
 fn as_interval(TimeRange { from, until }: TimeRange) -> Range<i32> {
-    let from = (from.year() << 9) | from.ordinal() as i32;
-    let until = (until.year() << 9) | until.ordinal() as i32;
+    let from = date_as_scalar(from);
+    let until = date_as_scalar(until);
 
     from..(until + 1)
 }
diff --git a/src/lib.rs b/src/lib.rs
index 507a9831a8..90087a919a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -32,6 +32,7 @@ use serde::{
     ser::{Serialize, Serializer},
 };
 use tantivy::{schema::Facet, Score};
+use time::Date;
 
 pub fn data_path_from_env() -> PathBuf {
     var_os("DATA_PATH")
@@ -302,6 +303,11 @@ pub fn get_two_mut<T>(slice: &mut [T], idx1: usize, idx2: usize) -> (&mut T, &mu
     (val1, val2)
 }
 
+#[inline]
+pub fn date_as_scalar(date: Date) -> i32 {
+    (date.year() << 9) | date.ordinal() as i32
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
-- 
GitLab