From 586568453fa1145351a88ff900a614f1105e32e7 Mon Sep 17 00:00:00 2001 From: Shivani Bhardwaj Date: Thu, 9 Jan 2025 12:18:24 +0530 Subject: [PATCH] datasets: move initial file reading to rust In a recent warning reported by scan-build, datasets were found to be using a blocking call in a critical section. datasets.c:187:12: warning: Call to blocking function 'fgets' inside of critical section [unix.BlockInCriticalSection] 187 | while (fgets(line, (int)sizeof(line), fp) != NULL) { | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ datasets.c:292:12: warning: Call to blocking function 'fgets' inside of critical section [unix.BlockInCriticalSection] 292 | while (fgets(line, (int)sizeof(line), fp) != NULL) { | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ datasets.c:368:12: warning: Call to blocking function 'fgets' inside of critical section [unix.BlockInCriticalSection] 368 | while (fgets(line, (int)sizeof(line), fp) != NULL) { | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ datasets.c:442:12: warning: Call to blocking function 'fgets' inside of critical section [unix.BlockInCriticalSection] 442 | while (fgets(line, (int)sizeof(line), fp) != NULL) { | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ datasets.c:512:12: warning: Call to blocking function 'fgets' inside of critical section [unix.BlockInCriticalSection] 512 | while (fgets(line, (int)sizeof(line), fp) != NULL) { | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 5 warnings generated. These calls are blocking in the multi tenant mode where several tenants may be trying to load the same dataset in parallel. In a single tenant mode, this operation is performed as a part of a single thread before the engine startup. In order to evade the warning and simplify the code, the initial file reading is moved to Rust with this commit with a much simpler handling of dataset and datarep. Bug 7398 --- rust/cbindgen.toml | 1 + rust/src/detect/datasets.rs | 151 ++++++++++++++++++++++++++++++++++++ rust/src/detect/mod.rs | 1 + src/datasets-reputation.h | 4 +- src/datasets.c | 75 ++---------------- src/datasets.h | 1 + 6 files changed, 162 insertions(+), 71 deletions(-) create mode 100644 rust/src/detect/datasets.rs diff --git a/rust/cbindgen.toml b/rust/cbindgen.toml index eac6aa737760..e2730789c808 100644 --- a/rust/cbindgen.toml +++ b/rust/cbindgen.toml @@ -84,6 +84,7 @@ include = [ "FtpEvent", "SCSigTableElmt", "SCTransformTableElmt", + "DataRepType", ] # A list of items to not include in the generated bindings diff --git a/rust/src/detect/datasets.rs b/rust/src/detect/datasets.rs new file mode 100644 index 000000000000..0e12da502b0d --- /dev/null +++ b/rust/src/detect/datasets.rs @@ -0,0 +1,151 @@ +/* Copyright (C) 2025 Open Information Security Foundation + * + * You can copy, redistribute or modify this Program under the terms of + * the GNU General Public License version 2 as published by the Free + * Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +// Author: Shivani Bhardwaj + +//! This module exposes items from the datasets C code to Rust. + +use crate::common::CallbackFatalErrorOnInit; +use base64::{self, Engine}; +use std::ffi::{c_char, CStr, CString}; +use std::fs::{File, OpenOptions}; +use std::io::{self, BufRead}; +use std::path::Path; + +/// Opaque Dataset type defined in C +#[derive(Copy, Clone)] +pub enum Dataset {} + +// Simple C type converted to Rust +#[derive(Debug, PartialEq)] +#[repr(C)] +pub struct DataRepType { + pub value: u16, +} + +// Extern fns operating on the opaque Dataset type above +/// cbindgen:ignore +extern "C" { + pub fn DatasetAdd(set: &Dataset, data: *const u8, len: u32) -> i32; + pub fn DatasetAddwRep(set: &Dataset, data: *const u8, len: u32, rep: *const DataRepType) + -> i32; +} + +#[no_mangle] +pub unsafe extern "C" fn ProcessStringDatasets( + set: &Dataset, name: *const c_char, fname: *const c_char, fmode: *const c_char, +) -> i32 { + let file_string = unwrap_or_return!(CStr::from_ptr(fname).to_str(), -2); + let mode = unwrap_or_return!(CStr::from_ptr(fmode).to_str(), -2); + let set_name = unwrap_or_return!(CStr::from_ptr(name).to_str(), -2); + let filename = Path::new(file_string); + let mut is_dataset = false; + let mut is_datarep = false; + if let Ok(lines) = read_lines(filename, mode) { + for line in lines.map_while(Result::ok) { + let v: Vec<&str> = line.split(',').collect(); + // Ignore empty and invalid lines in dataset/rep file + if v.is_empty() || v.len() > 2 { + continue; + } + if v.len() == 1 { + if is_datarep { + SCLogError!( + "Cannot mix dataset and datarep values for set {} in {}", + set_name, + filename.display() + ); + return -2; + } + is_dataset = true; + // Dataset + let mut decoded: Vec = vec![]; + if base64::engine::general_purpose::STANDARD + .decode_vec(v[0], &mut decoded) + .is_err() + { + let msg = CString::new(format!( + "bad base64 encoding {} in {}", + set_name, + filename.display() + )) + .unwrap(); + CallbackFatalErrorOnInit(msg.as_ptr()); + continue; + } + DatasetAdd(set, decoded.as_ptr(), decoded.len() as u32); + } else { + if is_dataset { + SCLogError!( + "Cannot mix dataset and datarep values for set {} in {}", + set_name, + filename.display() + ); + return -2; + } + // Datarep + is_datarep = true; + let mut decoded: Vec = vec![]; + if base64::engine::general_purpose::STANDARD + .decode_vec(v[0], &mut decoded) + .is_err() + { + let msg = CString::new(format!( + "bad base64 encoding {} in {}", + set_name, + filename.display() + )) + .unwrap(); + CallbackFatalErrorOnInit(msg.as_ptr()); + continue; + } + if let Ok(val) = v[1].to_string().parse::() { + let rep: DataRepType = DataRepType { value: val }; + DatasetAddwRep(set, decoded.as_ptr(), decoded.len() as u32, &rep); + } else { + let msg = CString::new(format!( + "invalid datarep value {} in {}", + set_name, + filename.display() + )) + .unwrap(); + CallbackFatalErrorOnInit(msg.as_ptr()); + continue; + } + } + } + } else { + return -1; + } + 0 +} + +fn read_lines

(filename: P, fmode: &str) -> io::Result>> +where + P: AsRef, +{ + let file: File = if fmode == "r" { + File::open(filename)? + } else { + OpenOptions::new() + .append(true) + .create(true) + .read(true) + .open(filename)? + }; + Ok(io::BufReader::new(file).lines()) +} diff --git a/rust/src/detect/mod.rs b/rust/src/detect/mod.rs index 1857c22ee2b2..5ade8bf5d9b1 100644 --- a/rust/src/detect/mod.rs +++ b/rust/src/detect/mod.rs @@ -29,6 +29,7 @@ pub mod transforms; pub mod uint; pub mod uri; pub mod tojson; +pub mod datasets; use crate::core::AppProto; use std::os::raw::{c_int, c_void}; diff --git a/src/datasets-reputation.h b/src/datasets-reputation.h index 3483d823cd6e..18ced6803705 100644 --- a/src/datasets-reputation.h +++ b/src/datasets-reputation.h @@ -24,9 +24,7 @@ #ifndef SURICATA_DATASETS_REPUTATION_H #define SURICATA_DATASETS_REPUTATION_H -typedef struct DataRepType { - uint16_t value; -} DataRepType; +#include "rust-bindings.h" typedef struct DataRepResultType { bool found; diff --git a/src/datasets.c b/src/datasets.c index 402c7d34fe99..f1f27e9cb164 100644 --- a/src/datasets.c +++ b/src/datasets.c @@ -44,8 +44,7 @@ SCMutex sets_lock = SCMUTEX_INITIALIZER; static Dataset *sets = NULL; static uint32_t set_ids = 0; -static int DatasetAddwRep(Dataset *set, const uint8_t *data, const uint32_t data_len, - DataRepType *rep); +int DatasetAddwRep(Dataset *set, const uint8_t *data, const uint32_t data_len, DataRepType *rep); static inline void DatasetUnlockData(THashData *d) { @@ -496,80 +495,21 @@ static int DatasetLoadString(Dataset *set) return 0; SCLogConfig("dataset: %s loading from '%s'", set->name, set->load); + const char *fopen_mode = "r"; if (strlen(set->save) > 0 && strcmp(set->save, set->load) == 0) { fopen_mode = "a+"; } - FILE *fp = fopen(set->load, fopen_mode); - if (fp == NULL) { - SCLogError("fopen '%s' failed: %s", set->load, strerror(errno)); + int retval = ProcessStringDatasets(set, set->name, set->load, fopen_mode); + if (retval == -2) { + FatalErrorOnInit("dataset %s could not be processed", set->name); + } else if (retval == -1) { return -1; } - uint32_t cnt = 0; - char line[1024]; - while (fgets(line, (int)sizeof(line), fp) != NULL) { - if (strlen(line) <= 1) - continue; - - char *r = strchr(line, ','); - if (r == NULL) { - line[strlen(line) - 1] = '\0'; - SCLogDebug("line: '%s'", line); - uint32_t decoded_size = Base64DecodeBufferSize(strlen(line)); - // coverity[alloc_strlen : FALSE] - uint8_t decoded[decoded_size]; - uint32_t num_decoded = - Base64Decode((const uint8_t *)line, strlen(line), Base64ModeStrict, decoded); - if (num_decoded == 0 && strlen(line) > 0) { - FatalErrorOnInit("bad base64 encoding %s/%s", set->name, set->load); - continue; - } - - if (DatasetAdd(set, (const uint8_t *)decoded, num_decoded) < 0) { - FatalErrorOnInit("dataset data add failed %s/%s", set->name, set->load); - continue; - } - cnt++; - } else { - line[strlen(line) - 1] = '\0'; - SCLogDebug("line: '%s'", line); - - *r = '\0'; - - uint32_t decoded_size = Base64DecodeBufferSize(strlen(line)); - uint8_t decoded[decoded_size]; - uint32_t num_decoded = - Base64Decode((const uint8_t *)line, strlen(line), Base64ModeStrict, decoded); - if (num_decoded == 0) { - FatalErrorOnInit("bad base64 encoding %s/%s", set->name, set->load); - continue; - } - - r++; - SCLogDebug("r '%s'", r); - - DataRepType rep = { .value = 0 }; - if (ParseRepLine(r, strlen(r), &rep) < 0) { - FatalErrorOnInit("die: bad rep"); - continue; - } - SCLogDebug("rep %u", rep.value); - - if (DatasetAddwRep(set, (const uint8_t *)decoded, num_decoded, &rep) < 0) { - FatalErrorOnInit("dataset data add failed %s/%s", set->name, set->load); - continue; - } - cnt++; - - SCLogDebug("line with rep %s, %s", line, r); - } - } THashConsolidateMemcap(set->hash); - fclose(fp); - SCLogConfig("dataset: %s loaded %u records", set->name, cnt); return 0; } @@ -1572,8 +1512,7 @@ int DatasetAdd(Dataset *set, const uint8_t *data, const uint32_t data_len) return -1; } -static int DatasetAddwRep(Dataset *set, const uint8_t *data, const uint32_t data_len, - DataRepType *rep) +int DatasetAddwRep(Dataset *set, const uint8_t *data, const uint32_t data_len, DataRepType *rep) { if (set == NULL) return -1; diff --git a/src/datasets.h b/src/datasets.h index 86bfed02b22f..1abfa889baa6 100644 --- a/src/datasets.h +++ b/src/datasets.h @@ -19,6 +19,7 @@ #define SURICATA_DATASETS_H #include "util-thash.h" +#include "rust.h" #include "datasets-reputation.h" int DatasetsInit(void);