From 9f7894058aedcdc6dc2f62d80081845607b5e94e Mon Sep 17 00:00:00 2001
From: Adam Reichold <adam.reichold@uba.de>
Date: Tue, 1 Oct 2024 08:38:05 +0000
Subject: [PATCH 1/2] Expose score weight parameters to facilitate their
 calibration.

---
 benches/search.rs         |  4 ++++
 server/src/ckan.rs        |  4 ++++
 server/src/search.rs      | 23 ++++++++++++------
 src/dataset/mod.rs        |  2 ++
 src/index/indexer.rs      | 22 +++++------------
 src/index/mod.rs          | 13 ++++++++++
 src/index/searcher/mod.rs | 50 ++++++++++++++++++++++++++++++++++++---
 src/stats.rs              |  2 +-
 xtask/src/main.rs         |  2 +-
 9 files changed, 94 insertions(+), 28 deletions(-)

diff --git a/benches/search.rs b/benches/search.rs
index 999d7c3d0d..5f5cb4f11f 100644
--- a/benches/search.rs
+++ b/benches/search.rs
@@ -22,6 +22,10 @@ fn main() {
                 black_box(false),
                 black_box(false),
                 black_box(None),
+                black_box(1.0),
+                black_box(1.0),
+                black_box(1.0),
+                black_box(1.0),
                 black_box(&[]),
                 black_box(100),
                 black_box(0),
diff --git a/server/src/ckan.rs b/server/src/ckan.rs
index 14e42fcd3e..fd52b3ca47 100644
--- a/server/src/ckan.rs
+++ b/server/src/ckan.rs
@@ -65,6 +65,10 @@ pub async fn package_search(
         false,
         false,
         None,
+        1.0,
+        1.0,
+        1.0,
+        1.0,
         Default::default(),
         params.rows,
         params.start,
diff --git a/server/src/search.rs b/server/src/search.rs
index 85bf7caeee..8a3430fb62 100644
--- a/server/src/search.rs
+++ b/server/src/search.rs
@@ -120,6 +120,10 @@ where
             params.bounding_box_contains,
             params.bounding_box_spatial_clusters,
             time_range,
+            params.popularity_weight.unwrap_or(1.0),
+            params.quality_weight.unwrap_or(1.0),
+            params.currentness_rate.unwrap_or(1.0),
+            params.currentness_weight.unwrap_or(1.0),
             &params.origin_weights,
             params.results_per_page,
             (params.page - 1) * params.results_per_page,
@@ -149,6 +153,10 @@ where
                 params.bounding_box_contains,
                 params.bounding_box_spatial_clusters,
                 time_range,
+                params.popularity_weight.unwrap_or(1.0),
+                params.quality_weight.unwrap_or(1.0),
+                params.currentness_rate.unwrap_or(1.0),
+                params.currentness_weight.unwrap_or(1.0),
                 &params.origin_weights,
                 params.results_per_page,
                 (params.page - 1) * params.results_per_page,
@@ -190,13 +198,6 @@ where
 
     let mut page = spawn_blocking(move || inner(params, searcher, cache, stats)).await??;
 
-    page.related_terms = fetch_similar_terms(
-        client,
-        &prometheus.similar_terms_request_duration,
-        &page.terms,
-    )
-    .await;
-
     Ok(accept.into_response(page))
 }
 
@@ -274,6 +275,14 @@ pub struct SearchParams<Q> {
     /// Upper bound of time range
     #[serde(default)]
     time_range_until: Option<Date>,
+    #[serde(default)]
+    popularity_weight: Option<f32>,
+    #[serde(default)]
+    quality_weight: Option<f32>,
+    #[serde(default)]
+    currentness_rate: Option<f32>,
+    #[serde(default)]
+    currentness_weight: Option<f32>,
     /// Enables selectively weighing the given origins
     ///
     /// This can be given multiple times and takes the form `<facet>^<weight>`
diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs
index 55ef83656c..fc45b7a147 100644
--- a/src/dataset/mod.rs
+++ b/src/dataset/mod.rs
@@ -398,6 +398,8 @@ pub struct ScoredDataset {
     pub value: UniquelyIdentifiedDataset,
     /// Score of this dataset by the search engine based on the given search parameters
     pub score: f32,
+    pub popularity: f64,
+    pub age: i64,
 }
 
 #[derive(ToSchema)]
diff --git a/src/index/indexer.rs b/src/index/indexer.rs
index 8b25aeb6f2..32cb8b8f9a 100644
--- a/src/index/indexer.rs
+++ b/src/index/indexer.rs
@@ -302,25 +302,15 @@ impl Indexer {
             .map(|time_range| time_range.until)
             .max();
 
-        let currentness = latest_date.map_or(1.0, |date| {
-            let age = date_as_scalar(date.min(self.today)) - date_as_scalar(self.today);
-
-            const RATE: f32 = 1.0 / 2173.913; // in one over days
-            1.0 + (RATE * age as f32).exp() // maps the range (-∞, 0] to (1, 2]
+        let age = latest_date.map_or(i32::MIN, |date| {
+            date_as_scalar(date.min(self.today)) - date_as_scalar(self.today)
         });
 
-        const QUALITY_WEIGHT: f32 = 0.028;
-        const POPULARITY_WEIGHT: f32 = 0.024;
-        const CURRENTNESS_WEIGHT: f32 = 0.089;
-        const BM25_WEIGHT: f32 = 1.0 - QUALITY_WEIGHT - POPULARITY_WEIGHT - CURRENTNESS_WEIGHT;
-        const {
-            assert!(BM25_WEIGHT > 0.0);
-        }
+        doc.add_f64(self.fields.popularity, popularity as f64);
+        doc.add_f64(self.fields.quality, quality as f64);
+        doc.add_i64(self.fields.age, age as i64);
 
-        let inherent_score = quality.powf(QUALITY_WEIGHT / BM25_WEIGHT)
-            * popularity.powf(POPULARITY_WEIGHT / BM25_WEIGHT)
-            * currentness.powf(CURRENTNESS_WEIGHT / BM25_WEIGHT)
-            * dataset.quality.accessibility.landing_page.inherent_score()
+        let inherent_score = dataset.quality.accessibility.landing_page.inherent_score()
             * dataset.status.inherent_score();
 
         doc.add_f64(self.fields.inherent_score, inherent_score as f64);
diff --git a/src/index/mod.rs b/src/index/mod.rs
index 5ba6bc0c03..c6465892e8 100644
--- a/src/index/mod.rs
+++ b/src/index/mod.rs
@@ -90,6 +90,10 @@ fn schema() -> Schema {
 
     schema.add_text_field("cas_rn", STRING);
 
+    schema.add_f64_field("popularity", FAST);
+    schema.add_f64_field("quality", FAST);
+    schema.add_i64_field("age", FAST);
+
     schema.add_f64_field("inherent_score", FAST);
     schema.add_bool_field("eligible_for_random", INDEXED);
 
@@ -133,6 +137,9 @@ struct Fields {
     watershed_id: Field,
     cas_rn: Field,
     inherent_score: Field,
+    popularity: Field,
+    quality: Field,
+    age: Field,
     eligible_for_random: Field,
     bounding_box_min_x: Field,
     bounding_box_min_y: Field,
@@ -183,6 +190,9 @@ impl Fields {
         let cas_rn = schema.get_field("cas_rn").unwrap();
 
         let inherent_score = schema.get_field("inherent_score").unwrap();
+        let popularity = schema.get_field("popularity").unwrap();
+        let quality = schema.get_field("quality").unwrap();
+        let age = schema.get_field("age").unwrap();
         let eligible_for_random = schema.get_field("eligible_for_random").unwrap();
 
         let bounding_box_min_x = schema.get_field("bounding_box_min_x").unwrap();
@@ -222,6 +232,9 @@ impl Fields {
             watershed_id,
             cas_rn,
             inherent_score,
+            popularity,
+            quality,
+            age,
             eligible_for_random,
             bounding_box_min_x,
             bounding_box_min_y,
diff --git a/src/index/searcher/mod.rs b/src/index/searcher/mod.rs
index 9aa93f642f..832aba4bff 100644
--- a/src/index/searcher/mod.rs
+++ b/src/index/searcher/mod.rs
@@ -155,13 +155,17 @@ impl Searcher {
         bounding_box_contains: bool,
         bounding_box_spatial_clusters: bool,
         time_range: Option<TimeRange>,
+        popularity_weight: f32,
+        quality_weight: f32,
+        currentness_rate: f32,
+        currentness_weight: f32,
         origin_weights: &[FacetWeight],
         limit: usize,
         offset: usize,
     ) -> Result<SearchResults> {
         let searcher = self.reader.searcher();
 
-        let key = if offset == 0 {
+        let key = if false {
             let key = search_key(
                 query,
                 types_root,
@@ -293,7 +297,13 @@ impl Searcher {
                 Count,
                 TopDocs::with_limit(limit)
                     .and_offset(offset)
-                    .tweak_score(ScoreTweaker { origin_weights }),
+                    .tweak_score(ScoreTweaker {
+                        popularity_weight,
+                        quality_weight,
+                        currentness_rate,
+                        currentness_weight,
+                        origin_weights,
+                    }),
                 (
                     types,
                     topics,
@@ -306,11 +316,20 @@ impl Searcher {
         let results = docs
             .into_iter()
             .map(|(score, doc)| {
+                let fast_fields = searcher.segment_reader(doc.segment_ord).fast_fields();
+                let popularity = fast_fields.f64("popularity")?.first(doc.doc_id).unwrap();
+                let age = fast_fields.i64("age")?.first(doc.doc_id).unwrap();
+
                 let doc = searcher.doc(doc)?;
 
                 let value = extract_fields(&self.fields, doc)?;
 
-                Ok(ScoredDataset { value, score })
+                Ok(ScoredDataset {
+                    value,
+                    score,
+                    popularity,
+                    age,
+                })
             })
             .collect::<Result<_>>()?;
 
@@ -776,6 +795,10 @@ fn last_term(ast: UserInputAst) -> Option<(String, Option<String>)> {
 }
 
 struct ScoreTweaker<'a> {
+    popularity_weight: f32,
+    quality_weight: f32,
+    currentness_rate: f32,
+    currentness_weight: f32,
     origin_weights: &'a [FacetWeight],
 }
 
@@ -817,12 +840,22 @@ impl TantivyScoreTweaker<Score> for ScoreTweaker<'_> {
     fn segment_tweaker(&self, reader: &SegmentReader) -> TantivyResult<Self::Child> {
         let fast_fields = reader.fast_fields();
         let inherent_score = fast_fields.f64("inherent_score")?.first_or_default_col(1.0);
+        let popularity = fast_fields.f64("popularity")?.first_or_default_col(1.0);
+        let quality = fast_fields.f64("quality")?.first_or_default_col(1.0);
+        let age = fast_fields.i64("age")?.first_or_default_col(0);
         let origins = fast_fields.str("origin")?.unwrap();
 
         let origin_weights = self.resolve_origin_weights(&origins)?;
 
         Ok(ScoreSegmentTweaker {
             inherent_score,
+            popularity_weight: self.popularity_weight,
+            popularity,
+            quality_weight: self.quality_weight,
+            quality,
+            currentness_rate: self.currentness_rate,
+            currentness_weight: self.currentness_weight,
+            age,
             origins,
             origin_weights,
         })
@@ -831,6 +864,13 @@ impl TantivyScoreTweaker<Score> for ScoreTweaker<'_> {
 
 struct ScoreSegmentTweaker {
     inherent_score: Arc<dyn ColumnValues<f64>>,
+    popularity_weight: f32,
+    popularity: Arc<dyn ColumnValues<f64>>,
+    quality_weight: f32,
+    quality: Arc<dyn ColumnValues<f64>>,
+    currentness_rate: f32,
+    currentness_weight: f32,
+    age: Arc<dyn ColumnValues<i64>>,
     origins: StrColumn,
     origin_weights: Vec<Score>,
 }
@@ -838,6 +878,10 @@ struct ScoreSegmentTweaker {
 impl TantivyScoreSegementTweaker<Score> for ScoreSegmentTweaker {
     fn score(&mut self, doc: DocId, mut score: Score) -> f32 {
         score *= self.inherent_score.get_val(doc) as f32;
+        score *= (self.popularity.get_val(doc) as f32).powf(self.popularity_weight);
+        score *= (self.quality.get_val(doc) as f32).powf(self.quality_weight);
+        score *= (1.0 + (self.currentness_rate * self.age.get_val(doc) as f32).exp())
+            .powf(self.currentness_weight);
 
         for origin in self.origins.term_ords(doc) {
             score *= self.origin_weights[origin as usize];
diff --git a/src/stats.rs b/src/stats.rs
index 6f932c0fa4..4914ca23be 100644
--- a/src/stats.rs
+++ b/src/stats.rs
@@ -107,7 +107,7 @@ impl Stats {
         let mut buf = {
             let mut this = this.lock();
 
-            this.decay(metrics);
+            // this.decay(metrics);
 
             options().serialize(&*this)?
         };
diff --git a/xtask/src/main.rs b/xtask/src/main.rs
index aeb24fae6a..ceec3f2b05 100644
--- a/xtask/src/main.rs
+++ b/xtask/src/main.rs
@@ -109,7 +109,7 @@ fn indexer() -> Result<()> {
 fn server() -> Result<()> {
     cargo(
         "Server",
-        &["run", "--package=server", "--bin=server"],
+        &["run", "--release", "--package=server", "--bin=server"],
         &[],
         &[
             ("DATA_PATH", "data"),
-- 
GitLab


From 652795d9ca8d450fbd32fda20be7bff0d09f3221 Mon Sep 17 00:00:00 2001
From: Adam Reichold <adam.reichold@uba.de>
Date: Wed, 5 Feb 2025 10:46:27 +0000
Subject: [PATCH 2/2] Show latest date on search result view for debugging our
 ranking.

---
 server/templates/search.html | 2 ++
 src/dataset/mod.rs           | 9 +++++++++
 2 files changed, 11 insertions(+)

diff --git a/server/templates/search.html b/server/templates/search.html
index a4a2ccd389..75000c6355 100644
--- a/server/templates/search.html
+++ b/server/templates/search.html
@@ -92,6 +92,8 @@
       <div>
         <h2><a href="/dataset/{{ result.value.source }}/{{ result.value.id }}" title="Score: {{ result.score }}"">{{ result.value.value.title }}</a></h2>
 
+        <p>Latest date: {{ "{:#?}"|format(result.value.value.latest_date()) }} </p>
+
         {% if let Some(description) = result.value.value.description %} <p>{{ description }}</p> {% endif %}
       </div>
 
diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs
index fc45b7a147..3b3a2e89f6 100644
--- a/src/dataset/mod.rs
+++ b/src/dataset/mod.rs
@@ -347,6 +347,15 @@ impl Dataset {
 
         Ok(())
     }
+
+    pub fn latest_date(&self) -> Option<Date> {
+        self.time_ranges
+            .iter()
+            .map(|time_range| time_range.until)
+            .chain(self.issued)
+            .chain(self.modified)
+            .max()
+    }
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize, ToSchema)]
-- 
GitLab