Skip to content

Commit

Permalink
number of terms in query as ranking signal
Browse files Browse the repository at this point in the history
The signal has a default coefficient of 0 because it doesn't make sense to include it in a linear combination, but it can still be used in more advanced pipelines.
  • Loading branch information
mikkeldenker committed Aug 22, 2024
1 parent 8ec40ab commit acb6509
Show file tree
Hide file tree
Showing 7 changed files with 64 additions and 18 deletions.
7 changes: 5 additions & 2 deletions crates/core/src/ranking/core/computer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,10 @@ impl QueryData {
/// Returns the region stored in this query data, or `None` if no region is set.
pub fn selected_region(&self) -> Option<crate::webpage::Region> {
self.selected_region
}

/// Returns the query's simple terms as a borrowed slice.
pub fn simple_terms(&self) -> &[String] {
&self.simple_terms
}
}

pub struct SignalComputer {
Expand Down Expand Up @@ -228,10 +232,9 @@ impl SignalComputer {
current_timestamp: None,
linear_regression: None,
query_data: query,
order: SignalComputeOrder::empty(),
order: SignalComputeOrder::new(),
};

s.order = SignalComputeOrder::new(&s);
s.set_current_timestamp(chrono::Utc::now().timestamp() as usize);

s
Expand Down
13 changes: 1 addition & 12 deletions crates/core/src/ranking/core/computer/order.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,11 @@ impl SignalComputeOrder {
}
}

pub fn new(signal_computer: &SignalComputer) -> Self {
pub fn new() -> Self {
let mut text_signals = EnumMap::new();
let mut other_signals = Vec::new();

for signal in SignalEnum::all() {
if signal_computer.coefficient(&signal) == 0.0 {
continue;
}

if let Some(text_field) = signal.as_textfield() {
let mono = text_field.monogram_field();

Expand Down Expand Up @@ -76,13 +72,6 @@ impl SignalComputeOrder {
.values()
.flat_map(move |ngram| ngram.compute(doc, signal_computer))
.chain(self.other_signals.iter().map(move |signal| {
if signal_computer.coefficient(signal) == 0.0 {
return ComputedSignal {
signal: *signal,
score: 0.0,
};
}

signal
.compute(doc, signal_computer)
.map(|score| ComputedSignal {
Expand Down
1 change: 1 addition & 0 deletions crates/core/src/ranking/core/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use utoipa::ToSchema;
mod computer;
mod non_text;
mod prelude;
mod query;
mod text;

pub use computer::SignalComputer;
Expand Down
3 changes: 2 additions & 1 deletion crates/core/src/ranking/core/non_text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,8 @@ impl Signal for QueryCentrality {
}

fn compute(&self, _doc: DocId, _signal_computer: &SignalComputer) -> Option<f64> {
unimplemented!()
// unimplemented!()
None
}
}

Expand Down
6 changes: 5 additions & 1 deletion crates/core/src/ranking/core/prelude.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use crate::enum_dispatch_from_discriminant;
// Stract is an open source web search engine.
// Copyright (C) 2024 Stract ApS
//
Expand All @@ -14,6 +13,8 @@ use crate::enum_dispatch_from_discriminant;
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>

use crate::enum_dispatch_from_discriminant;
use crate::enum_map::{GetEnumMapKey, InsertEnumMapKey};

use crate::schema::Field;
Expand All @@ -26,6 +27,7 @@ use enum_dispatch::enum_dispatch;
use strum::{EnumDiscriminants, VariantArray};

use super::non_text::*;
use super::query::*;
use super::text::*;
use super::SignalComputer;
use tantivy::DocId;
Expand Down Expand Up @@ -120,6 +122,7 @@ pub enum SignalEnum {
TitleEmbeddingSimilarity,
KeywordEmbeddingSimilarity,
HasAds,
NumQueryTerms,
}

enum_dispatch_from_discriminant!(SignalEnumDiscriminants => SignalEnum,
Expand Down Expand Up @@ -166,6 +169,7 @@ enum_dispatch_from_discriminant!(SignalEnumDiscriminants => SignalEnum,
TitleEmbeddingSimilarity,
KeywordEmbeddingSimilarity,
HasAds,
NumQueryTerms
]);

impl SignalEnum {
Expand Down
50 changes: 50 additions & 0 deletions crates/core/src/ranking/core/query.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
use tantivy::DocId;

use crate::schema::Field;

// Stract is an open source web search engine.
// Copyright (C) 2024 Stract ApS
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

use super::{Signal, SignalComputer};

/// Ranking signal that reports how many simple terms the query contains.
///
/// The value is derived from the query data alone; the document being
/// scored is not consulted.
#[derive(
Debug,
Clone,
Copy,
PartialEq,
Eq,
Hash,
serde::Serialize,
serde::Deserialize,
bincode::Encode,
bincode::Decode,
)]
pub struct NumQueryTerms;
impl Signal for NumQueryTerms {
    /// Default coefficient for this signal. It is zero, so the signal does
    /// not contribute to a plain linear combination of signals.
    fn default_coefficient(&self) -> f64 {
        0.0
    }

    /// This signal is not associated with any schema field.
    fn as_field(&self) -> Option<Field> {
        None
    }

    /// Yields the number of simple terms in the query as a float.
    ///
    /// Returns `None` when the signal computer has no query data. The
    /// document id is ignored.
    fn compute(&self, _: DocId, signal_computer: &SignalComputer) -> Option<f64> {
        let query = signal_computer.query_data()?;
        let term_count = query.simple_terms().len();

        Some(term_count as f64)
    }
}
2 changes: 0 additions & 2 deletions crates/core/src/webpage/adservers.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1842,7 +1842,6 @@ gloyah.net,
gmads.net,
gml.email,
go-clicks.de,
go-mpulse.net,
go-rank.de,
go.dhs.gov,
go.eu.sparkpostmail1.com,
Expand Down Expand Up @@ -2435,7 +2434,6 @@ nondescriptstocking.com,
nothingunit.com,
novem.pl,
npttech.com,
nr-data.net,
nr.mmcdn.com,
nr.static.mmcdn.com,
ns1p.net,
Expand Down

0 comments on commit acb6509

Please sign in to comment.