From 9f7894058aedcdc6dc2f62d80081845607b5e94e Mon Sep 17 00:00:00 2001 From: Adam Reichold <adam.reichold@uba.de> Date: Tue, 1 Oct 2024 08:38:05 +0000 Subject: [PATCH 1/2] Expose score weight parameters to facilitate their calibration. --- benches/search.rs | 4 ++++ server/src/ckan.rs | 4 ++++ server/src/search.rs | 23 ++++++++++++------ src/dataset/mod.rs | 2 ++ src/index/indexer.rs | 22 +++++------------ src/index/mod.rs | 13 ++++++++++ src/index/searcher/mod.rs | 50 ++++++++++++++++++++++++++++++++++++--- src/stats.rs | 2 +- xtask/src/main.rs | 2 +- 9 files changed, 94 insertions(+), 28 deletions(-) diff --git a/benches/search.rs b/benches/search.rs index 999d7c3d0d..5f5cb4f11f 100644 --- a/benches/search.rs +++ b/benches/search.rs @@ -22,6 +22,10 @@ fn main() { black_box(false), black_box(false), black_box(None), + black_box(1.0), + black_box(1.0), + black_box(1.0), + black_box(1.0), black_box(&[]), black_box(100), black_box(0), diff --git a/server/src/ckan.rs b/server/src/ckan.rs index 14e42fcd3e..fd52b3ca47 100644 --- a/server/src/ckan.rs +++ b/server/src/ckan.rs @@ -65,6 +65,10 @@ pub async fn package_search( false, false, None, + 1.0, + 1.0, + 1.0, + 1.0, Default::default(), params.rows, params.start, diff --git a/server/src/search.rs b/server/src/search.rs index 85bf7caeee..8a3430fb62 100644 --- a/server/src/search.rs +++ b/server/src/search.rs @@ -120,6 +120,10 @@ where params.bounding_box_contains, params.bounding_box_spatial_clusters, time_range, + params.popularity_weight.unwrap_or(1.0), + params.quality_weight.unwrap_or(1.0), + params.currentness_rate.unwrap_or(1.0), + params.currentness_weight.unwrap_or(1.0), &params.origin_weights, params.results_per_page, (params.page - 1) * params.results_per_page, @@ -149,6 +153,10 @@ where params.bounding_box_contains, params.bounding_box_spatial_clusters, time_range, + params.popularity_weight.unwrap_or(1.0), + params.quality_weight.unwrap_or(1.0), + params.currentness_rate.unwrap_or(1.0), +
params.currentness_weight.unwrap_or(1.0), &params.origin_weights, params.results_per_page, (params.page - 1) * params.results_per_page, @@ -190,13 +198,6 @@ where let mut page = spawn_blocking(move || inner(params, searcher, cache, stats)).await??; - page.related_terms = fetch_similar_terms( - client, - &prometheus.similar_terms_request_duration, - &page.terms, - ) - .await; - Ok(accept.into_response(page)) } @@ -274,6 +275,14 @@ pub struct SearchParams<Q> { /// Upper bound of time range #[serde(default)] time_range_until: Option<Date>, + #[serde(default)] + popularity_weight: Option<f32>, + #[serde(default)] + quality_weight: Option<f32>, + #[serde(default)] + currentness_rate: Option<f32>, + #[serde(default)] + currentness_weight: Option<f32>, /// Enables selectively weighing the given origins /// /// This can be given multiple times and takes the form `<facet>^<weight>` diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index 55ef83656c..fc45b7a147 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -398,6 +398,8 @@ pub struct ScoredDataset { pub value: UniquelyIdentifiedDataset, /// Score of this dataset by the search engine based on the given search parameters pub score: f32, + pub popularity: f64, + pub age: i64, } #[derive(ToSchema)] diff --git a/src/index/indexer.rs b/src/index/indexer.rs index 8b25aeb6f2..32cb8b8f9a 100644 --- a/src/index/indexer.rs +++ b/src/index/indexer.rs @@ -302,25 +302,15 @@ impl Indexer { .map(|time_range| time_range.until) .max(); - let currentness = latest_date.map_or(1.0, |date| { - let age = date_as_scalar(date.min(self.today)) - date_as_scalar(self.today); - - const RATE: f32 = 1.0 / 2173.913; // in one over days - 1.0 + (RATE * age as f32).exp() // maps the range (-∞, 0] to (1, 2] + let age = latest_date.map_or(i32::MIN, |date| { + date_as_scalar(date.min(self.today)) - date_as_scalar(self.today) }); - const QUALITY_WEIGHT: f32 = 0.028; - const POPULARITY_WEIGHT: f32 = 0.024; - const CURRENTNESS_WEIGHT: f32 = 0.089; -
const BM25_WEIGHT: f32 = 1.0 - QUALITY_WEIGHT - POPULARITY_WEIGHT - CURRENTNESS_WEIGHT; - const { - assert!(BM25_WEIGHT > 0.0); - } + doc.add_f64(self.fields.popularity, quality as f64); + doc.add_f64(self.fields.quality, popularity as f64); + doc.add_i64(self.fields.age, age as i64); - let inherent_score = quality.powf(QUALITY_WEIGHT / BM25_WEIGHT) - * popularity.powf(POPULARITY_WEIGHT / BM25_WEIGHT) - * currentness.powf(CURRENTNESS_WEIGHT / BM25_WEIGHT) - * dataset.quality.accessibility.landing_page.inherent_score() + let inherent_score = dataset.quality.accessibility.landing_page.inherent_score() * dataset.status.inherent_score(); doc.add_f64(self.fields.inherent_score, inherent_score as f64); diff --git a/src/index/mod.rs b/src/index/mod.rs index 5ba6bc0c03..c6465892e8 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -90,6 +90,10 @@ fn schema() -> Schema { schema.add_text_field("cas_rn", STRING); + schema.add_f64_field("popularity", FAST); + schema.add_f64_field("quality", FAST); + schema.add_i64_field("age", FAST); + schema.add_f64_field("inherent_score", FAST); schema.add_bool_field("eligible_for_random", INDEXED); @@ -133,6 +137,9 @@ struct Fields { watershed_id: Field, cas_rn: Field, inherent_score: Field, + popularity: Field, + quality: Field, + age: Field, eligible_for_random: Field, bounding_box_min_x: Field, bounding_box_min_y: Field, @@ -183,6 +190,9 @@ impl Fields { let cas_rn = schema.get_field("cas_rn").unwrap(); let inherent_score = schema.get_field("inherent_score").unwrap(); + let popularity = schema.get_field("popularity").unwrap(); + let quality = schema.get_field("quality").unwrap(); + let age = schema.get_field("age").unwrap(); let eligible_for_random = schema.get_field("eligible_for_random").unwrap(); let bounding_box_min_x = schema.get_field("bounding_box_min_x").unwrap(); @@ -222,6 +232,9 @@ impl Fields { watershed_id, cas_rn, inherent_score, + popularity, + quality, + age, eligible_for_random, bounding_box_min_x, 
bounding_box_min_y, diff --git a/src/index/searcher/mod.rs b/src/index/searcher/mod.rs index 9aa93f642f..832aba4bff 100644 --- a/src/index/searcher/mod.rs +++ b/src/index/searcher/mod.rs @@ -155,13 +155,17 @@ impl Searcher { bounding_box_contains: bool, bounding_box_spatial_clusters: bool, time_range: Option<TimeRange>, + popularity_weight: f32, + quality_weight: f32, + currentness_rate: f32, + currentness_weight: f32, origin_weights: &[FacetWeight], limit: usize, offset: usize, ) -> Result<SearchResults> { let searcher = self.reader.searcher(); - let key = if offset == 0 { + let key = if false { let key = search_key( query, types_root, @@ -293,7 +297,13 @@ impl Searcher { Count, TopDocs::with_limit(limit) .and_offset(offset) - .tweak_score(ScoreTweaker { origin_weights }), + .tweak_score(ScoreTweaker { + popularity_weight, + quality_weight, + currentness_rate, + currentness_weight, + origin_weights, + }), ( types, topics, @@ -306,11 +316,20 @@ impl Searcher { let results = docs .into_iter() .map(|(score, doc)| { + let fast_fields = searcher.segment_reader(doc.segment_ord).fast_fields(); + let popularity = fast_fields.f64("popularity")?.first(doc.doc_id).unwrap(); + let age = fast_fields.i64("age")?.first(doc.doc_id).unwrap(); + let doc = searcher.doc(doc)?; let value = extract_fields(&self.fields, doc)?; - Ok(ScoredDataset { value, score }) + Ok(ScoredDataset { + value, + score, + popularity, + age, + }) }) .collect::<Result<_>>()?; @@ -776,6 +795,10 @@ fn last_term(ast: UserInputAst) -> Option<(String, Option<String>)> { } struct ScoreTweaker<'a> { + popularity_weight: f32, + quality_weight: f32, + currentness_rate: f32, + currentness_weight: f32, origin_weights: &'a [FacetWeight], } @@ -817,12 +840,22 @@ impl TantivyScoreTweaker<Score> for ScoreTweaker<'_> { fn segment_tweaker(&self, reader: &SegmentReader) -> TantivyResult<Self::Child> { let fast_fields = reader.fast_fields(); let inherent_score = fast_fields.f64("inherent_score")?.first_or_default_col(1.0); + 
let popularity = fast_fields.f64("popularity")?.first_or_default_col(1.0); + let quality = fast_fields.f64("quality")?.first_or_default_col(1.0); + let age = fast_fields.i64("age")?.first_or_default_col(0); let origins = fast_fields.str("origin")?.unwrap(); let origin_weights = self.resolve_origin_weights(&origins)?; Ok(ScoreSegmentTweaker { inherent_score, + popularity_weight: self.popularity_weight, + popularity, + quality_weight: self.quality_weight, + quality, + currentness_rate: self.currentness_rate, + currentness_weight: self.currentness_weight, + age, origins, origin_weights, }) @@ -831,6 +864,13 @@ impl TantivyScoreTweaker<Score> for ScoreTweaker<'_> { struct ScoreSegmentTweaker { inherent_score: Arc<dyn ColumnValues<f64>>, + popularity_weight: f32, + popularity: Arc<dyn ColumnValues<f64>>, + quality_weight: f32, + quality: Arc<dyn ColumnValues<f64>>, + currentness_rate: f32, + currentness_weight: f32, + age: Arc<dyn ColumnValues<i64>>, origins: StrColumn, origin_weights: Vec<Score>, } @@ -838,6 +878,10 @@ struct ScoreSegmentTweaker { impl TantivyScoreSegementTweaker<Score> for ScoreSegmentTweaker { fn score(&mut self, doc: DocId, mut score: Score) -> f32 { score *= self.inherent_score.get_val(doc) as f32; + score *= (self.popularity.get_val(doc) as f32).powf(self.popularity_weight); + score *= (self.quality.get_val(doc) as f32).powf(self.quality_weight); + score *= (1.0 + (self.currentness_rate * self.age.get_val(doc) as f32).exp()) + .powf(self.currentness_weight); for origin in self.origins.term_ords(doc) { score *= self.origin_weights[origin as usize]; diff --git a/src/stats.rs b/src/stats.rs index 6f932c0fa4..4914ca23be 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -107,7 +107,7 @@ impl Stats { let mut buf = { let mut this = this.lock(); - this.decay(metrics); + // this.decay(metrics); options().serialize(&*this)? 
}; diff --git a/xtask/src/main.rs b/xtask/src/main.rs index aeb24fae6a..ceec3f2b05 100644 --- a/xtask/src/main.rs +++ b/xtask/src/main.rs @@ -109,7 +109,7 @@ fn indexer() -> Result<()> { fn server() -> Result<()> { cargo( "Server", - &["run", "--package=server", "--bin=server"], + &["run", "--release", "--package=server", "--bin=server"], &[], &[ ("DATA_PATH", "data"), -- GitLab From 652795d9ca8d450fbd32fda20be7bff0d09f3221 Mon Sep 17 00:00:00 2001 From: Adam Reichold <adam.reichold@uba.de> Date: Wed, 5 Feb 2025 10:46:27 +0000 Subject: [PATCH 2/2] Show latest date on search result view for debugging our ranking. --- server/templates/search.html | 2 ++ src/dataset/mod.rs | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/server/templates/search.html b/server/templates/search.html index a4a2ccd389..75000c6355 100644 --- a/server/templates/search.html +++ b/server/templates/search.html @@ -92,6 +92,8 @@ <div> <h2><a href="/dataset/{{ result.value.source }}/{{ result.value.id }}" title="Score: {{ result.score }}"">{{ result.value.value.title }}</a></h2> + <p>Latest date: {{ "{:#?}"|format(result.value.value.latest_date()) }} </p> + {% if let Some(description) = result.value.value.description %} <p>{{ description }}</p> {% endif %} </div> diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index fc45b7a147..3b3a2e89f6 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -347,6 +347,15 @@ impl Dataset { Ok(()) } + + pub fn latest_date(&self) -> Option<Date> { + self.time_ranges + .iter() + .map(|time_range| time_range.until) + .chain(self.issued) + .chain(self.modified) + .max() + } } #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize, ToSchema)] -- GitLab