diff --git a/Cargo.toml b/Cargo.toml
index 48e40c6..b29356e 100755
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,8 +7,8 @@ edition = "2021"
 
 [dependencies]
 nom = "7.1.3"
-serde_json = "1.0.127"
-serde = { version = "1.0.209", features = ["derive"] }
+serde_json = "1.0.128"
+serde = { version = "1.0.210", features = ["derive"] }
 log = "0.4.22"
 lz4_flex = "0.11.3"
 byteorder = "1.5.0"
diff --git a/examples/Cargo.toml b/examples/Cargo.toml
index 2eb5d2b..06a1479 100644
--- a/examples/Cargo.toml
+++ b/examples/Cargo.toml
@@ -2,5 +2,6 @@
 members = [
     "unifiedlog_parser",
     "unifiedlog_parser_json",
-    "parse_tracev3",
-]
\ No newline at end of file
+    "parse_tracev3",
+    "unifiedlog_iterator",
+]
diff --git a/examples/unifiedlog_iterator/Cargo.toml b/examples/unifiedlog_iterator/Cargo.toml
new file mode 100644
index 0000000..24372c5
--- /dev/null
+++ b/examples/unifiedlog_iterator/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "unifiedlog_iterator"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+simplelog = "0.12.2"
+csv = "1.3.0"
+chrono = "0.4.38"
+log = "0.4.22"
+macos-unifiedlogs = {path = "../../"}
+clap = {version = "4.5.18", features = ["derive"]}
\ No newline at end of file
diff --git a/examples/unifiedlog_iterator/src/main.rs b/examples/unifiedlog_iterator/src/main.rs
new file mode 100644
index 0000000..bd82e41
--- /dev/null
+++ b/examples/unifiedlog_iterator/src/main.rs
@@ -0,0 +1,409 @@
+// Copyright 2022 Mandiant, Inc. All Rights Reserved
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and limitations under the License.
+
+use chrono::{SecondsFormat, TimeZone, Utc};
+use log::LevelFilter;
+use macos_unifiedlogs::dsc::SharedCacheStrings;
+use macos_unifiedlogs::iterator::UnifiedLogIterator;
+use macos_unifiedlogs::parser::{
+    build_log, collect_shared_strings, collect_shared_strings_system, collect_strings,
+    collect_strings_system, collect_timesync, collect_timesync_system,
+};
+use macos_unifiedlogs::timesync::TimesyncBoot;
+use macos_unifiedlogs::unified_log::{LogData, UnifiedLogData};
+use macos_unifiedlogs::uuidtext::UUIDText;
+use simplelog::{Config, SimpleLogger};
+use std::error::Error;
+use std::fs::OpenOptions;
+use std::io::Write;
+use std::path::PathBuf;
+use std::{fs, io};
+
+use clap::Parser;
+use csv::Writer;
+
+#[derive(Parser, Debug)]
+#[clap(version, about, long_about = None)]
+struct Args {
+    /// Run on live system
+    #[clap(short, long, default_value = "false")]
+    live: String,
+
+    /// Path to logarchive formatted directory
+    #[clap(short, long, default_value = "")]
+    input: String,
+
+    /// Path to output file. Any directories must already exist
+    #[clap(short, long, default_value = "")]
+    output: String,
+}
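+
+// Hypothetical invocations (paths are illustrative):
+//   unifiedlog_iterator --input /path/to/system_logs.logarchive --output logs.csv
+//   unifiedlog_iterator --live true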
+
+fn main() {
+    eprintln!("Starting Unified Log parser...");
+    // Set logging level to warning
+    SimpleLogger::init(LevelFilter::Warn, Config::default())
+        .expect("Failed to initialize simple logger");
+
+    let args = Args::parse();
+    let mut writer = construct_writer(&args.output).unwrap();
+    // Create headers for CSV file
+    output_header(&mut writer).unwrap();
+
+    if args.input != "" {
+        parse_log_archive(&args.input, &mut writer);
+    } else if args.live != "false" {
+        parse_live_system(&mut writer);
+    }
+}
+
+// Parse a provided directory path. Currently expects the path to follow the macOS log collect structure
+fn parse_log_archive(path: &str, writer: &mut Writer<Box<dyn Write>>) {
+    let mut archive_path = PathBuf::from(path);
+
+    // Parse all UUID files which contain strings and other metadata
+    let string_results = collect_strings(&archive_path.display().to_string()).unwrap();
+
+    archive_path.push("dsc");
+    // Parse UUID cache files which also contain strings and other metadata
+    let shared_strings_results =
+        collect_shared_strings(&archive_path.display().to_string()).unwrap();
+    archive_path.pop();
+
+    archive_path.push("timesync");
+    // Parse all timesync files
+    let timesync_data = collect_timesync(&archive_path.display().to_string()).unwrap();
+    archive_path.pop();
+
+    // Keep UUID, UUID cache, timesync files in memory while we parse all tracev3 files
+    // Allows for faster lookups
+    parse_trace_file(
+        &string_results,
+        &shared_strings_results,
+        &timesync_data,
+        path,
+        writer,
+    );
+
+    eprintln!("\nFinished parsing Unified Log data.");
+}
+
+// Parse a live macOS system
+fn parse_live_system(writer: &mut Writer<Box<dyn Write>>) {
+    let strings = collect_strings_system().unwrap();
+    let shared_strings = collect_shared_strings_system().unwrap();
+    let timesync_data = collect_timesync_system().unwrap();
+
+    parse_trace_file(
+        &strings,
+        &shared_strings,
+        &timesync_data,
+        "/private/var/db/diagnostics",
+        writer,
+    );
+
+    eprintln!("\nFinished parsing Unified Log data.");
+}
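+
+// Archive layout consumed by this example: UUIDText string files at the root,
+// shared-string caches under dsc/, timesync records under timesync/, and
+// tracev3 files under Persist/, Special/, Signpost/, and HighVolume/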
+
+// Use the provided strings, shared strings, and timesync data to parse the Unified Log data at the provided path.
+// Currently expects the path to follow the macOS log collect structure
+fn parse_trace_file(
+    string_results: &[UUIDText],
+    shared_strings_results: &[SharedCacheStrings],
+    timesync_data: &[TimesyncBoot],
+    path: &str,
+    writer: &mut Writer<Box<dyn Write>>,
+) {
+    // We need to persist the Oversize log entries (they contain large strings that don't fit in normal log entries)
+    // Some log entries have Oversize strings located in different tracev3 files.
+    // This is very rare: seen in ~20 of ~700,000 log entries in one sample, and ~700 of ~18 million in another
+    let mut oversize_strings = UnifiedLogData {
+        header: Vec::new(),
+        catalog_data: Vec::new(),
+        oversize: Vec::new(),
+    };
+
+    let mut missing_data: Vec<UnifiedLogData> = Vec::new();
+
+    let mut archive_path = PathBuf::from(path);
+    archive_path.push("Persist");
+
+    let mut log_count = 0;
+    if archive_path.exists() {
+        let paths = fs::read_dir(&archive_path).unwrap();
+
+        // Loop through all tracev3 files in Persist directory
+        for log_path in paths {
+            let data = log_path.unwrap();
+            let full_path = data.path().display().to_string();
+            eprintln!("Parsing: {}", full_path);
+
+            if data.path().exists() {
+                let count = iterate_chunks(
+                    &full_path,
+                    &mut missing_data,
+                    string_results,
+                    shared_strings_results,
+                    timesync_data,
+                    writer,
+                    &mut oversize_strings,
+                );
+                log_count += count;
+            } else {
+                eprintln!("File {} no longer on disk", full_path);
+                continue;
+            };
+        }
+    }
+
+    archive_path.pop();
+    archive_path.push("Special");
+
+    if archive_path.exists() {
+        let paths = fs::read_dir(&archive_path).unwrap();
+
+        // Loop through all tracev3 files in Special directory
+        for log_path in paths {
+            let data = log_path.unwrap();
+            let full_path = data.path().display().to_string();
+            eprintln!("Parsing: {}", full_path);
+
+            if data.path().exists() {
+                let count = iterate_chunks(
+                    &full_path,
+                    &mut missing_data,
+                    string_results,
+                    shared_strings_results,
+                    timesync_data,
+                    writer,
+                    &mut oversize_strings,
+                );
+                log_count += count;
+            } else {
+                eprintln!("File {} no longer on disk", full_path);
+                continue;
+            };
+        }
+    }
+
+    archive_path.pop();
+    archive_path.push("Signpost");
+
+    if archive_path.exists() {
+        let paths = fs::read_dir(&archive_path).unwrap();
+
+        // Loop through all tracev3 files in Signpost directory
+        for log_path in paths {
+            let data = log_path.unwrap();
+            let full_path = data.path().display().to_string();
+            eprintln!("Parsing: {}", full_path);
+
+            if data.path().exists() {
+                let count = iterate_chunks(
+                    &full_path,
+                    &mut missing_data,
+                    string_results,
+                    shared_strings_results,
+                    timesync_data,
+                    writer,
+                    &mut oversize_strings,
+                );
+                log_count += count;
+            } else {
+                eprintln!("File {} no longer on disk", full_path);
+                continue;
+            };
+        }
+    }
+    archive_path.pop();
+    archive_path.push("HighVolume");
+
+    if archive_path.exists() {
+        let paths = fs::read_dir(&archive_path).unwrap();
+
+        // Loop through all tracev3 files in HighVolume directory
+        for log_path in paths {
+            let data = log_path.unwrap();
+            let full_path = data.path().display().to_string();
+            eprintln!("Parsing: {}", full_path);
+
+            if data.path().exists() {
+                let count = iterate_chunks(
+                    &full_path,
+                    &mut missing_data,
+                    string_results,
+                    shared_strings_results,
+                    timesync_data,
+                    writer,
+                    &mut oversize_strings,
+                );
+                log_count += count;
+            } else {
+                eprintln!("File {} no longer on disk", full_path);
+                continue;
+            };
+        }
+    }
+    archive_path.pop();
+
+    archive_path.push("logdata.LiveData.tracev3");
+
+    // Check if livedata exists. We only have it if 'log collect' was used
+    if archive_path.exists() {
+        eprintln!("Parsing: logdata.LiveData.tracev3");
+
+        let count = iterate_chunks(
+            &archive_path.display().to_string(),
+            &mut missing_data,
+            string_results,
+            shared_strings_results,
+            timesync_data,
+            writer,
+            &mut oversize_strings,
+        );
+        log_count += count;
+        archive_path.pop();
+    }
+
+    let include_missing = false;
+    println!("Oversize cache size: {}", oversize_strings.oversize.len());
+    println!("Logs with missing Oversize strings: {}", missing_data.len());
+    println!("Checking Oversize cache one more time...");
+
+    // Since we have all Oversize entries now, go through any log entries that we were not able to build before
+    for mut leftover_data in missing_data {
+        // Add all of our previous oversize data to logs for lookups
+        leftover_data
+            .oversize = oversize_strings.oversize.clone();
+
+        // Exclude_missing = false
+        // If we fail to find any missing data it's probably due to the logs rolling
+        // Ex: tracev3A rolls; tracev3B references an Oversize entry in tracev3A, which triggers missing data since tracev3A is gone
+        let (results, _) = build_log(
+            &leftover_data,
+            string_results,
+            shared_strings_results,
+            timesync_data,
+            include_missing,
+        );
+        log_count += results.len();
+
+        output(&results, writer).unwrap();
+    }
+    eprintln!("Parsed {} log entries", log_count);
+}
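+
+// Parse a single tracev3 file chunk-by-chunk. Oversize entries seen so far are
+// shared with each chunk so large strings can be resolved across chunk (and file) boundaries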
+fn iterate_chunks(
+    path: &str,
+    missing: &mut Vec<UnifiedLogData>,
+    strings_data: &[UUIDText],
+    shared_strings: &[SharedCacheStrings],
+    timesync_data: &[TimesyncBoot],
+    writer: &mut Writer<Box<dyn Write>>,
+    oversize_strings: &mut UnifiedLogData,
+) -> usize {
+    let log_bytes = fs::read(path).unwrap();
+    let log_iterator = UnifiedLogIterator {
+        data: log_bytes,
+        header: Vec::new(),
+    };
+
+    // Exclude missing data from returned output. Keep separate until we parse all oversize entries.
+    // Then after parsing all logs, go through all missing data and check all parsed oversize entries again
+    let exclude_missing = true;
+
+    let mut count = 0;
+    for mut chunk in log_iterator {
+        chunk.oversize.append(&mut oversize_strings.oversize);
+        let (results, missing_logs) = build_log(
+            &chunk,
+            strings_data,
+            shared_strings,
+            timesync_data,
+            exclude_missing,
+        );
+        count += results.len();
+        oversize_strings.oversize = chunk.oversize;
+        output(&results, writer).unwrap();
+        if missing_logs.catalog_data.is_empty()
+            && missing_logs.header.is_empty()
+            && missing_logs.oversize.is_empty()
+        {
+            continue;
+        }
+        // Track possible missing log data due to oversize strings being in another file
+        missing.push(missing_logs);
+    }
+
+    count
+}
+
+fn construct_writer(output_path: &str) -> Result<Writer<Box<dyn Write>>, Box<dyn Error>> {
+    let writer = if output_path != "" {
+        Box::new(
+            OpenOptions::new()
+                .append(true)
+                .create(true)
+                .open(output_path)?,
+        ) as Box<dyn Write>
+    } else {
+        Box::new(io::stdout()) as Box<dyn Write>
+    };
+    Ok(Writer::from_writer(writer))
+}
+
+// Create the CSV file headers
+fn output_header(writer: &mut Writer<Box<dyn Write>>) -> Result<(), Box<dyn Error>> {
+    writer.write_record(&[
+        "Timestamp",
+        "Event Type",
+        "Log Type",
+        "Subsystem",
+        "Thread ID",
+        "PID",
+        "EUID",
+        "Library",
+        "Library UUID",
+        "Activity ID",
+        "Category",
+        "Process",
+        "Process UUID",
+        "Message",
+        "Raw Message",
+        "Boot UUID",
+        "System Timezone Name",
+    ])?;
+    writer.flush()?;
+    Ok(())
+}
+
+// Append log entries to the CSV file
+fn output(
+    results: &Vec<LogData>,
+    writer: &mut Writer<Box<dyn Write>>,
+) -> Result<(), Box<dyn Error>> {
+    for data in results {
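+        // data.time is nanoseconds since the UNIX epoch, e.g.
+        // 1642302327364384800 -> "2022-01-16T03:05:27.364Z"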
+        let date_time = Utc.timestamp_nanos(data.time as i64);
+        writer.write_record(&[
+            date_time.to_rfc3339_opts(SecondsFormat::Millis, true),
+            data.event_type.to_owned(),
+            data.log_type.to_owned(),
+            data.subsystem.to_owned(),
+            data.thread_id.to_string(),
+            data.pid.to_string(),
+            data.euid.to_string(),
+            data.library.to_owned(),
+            data.library_uuid.to_owned(),
+            data.activity_id.to_string(),
+            data.category.to_owned(),
+            data.process.to_owned(),
+            data.process_uuid.to_owned(),
+            data.message.to_owned(),
+            data.raw_message.to_owned(),
+            data.boot_uuid.to_owned(),
+            data.timezone_name.to_owned(),
+        ])?;
+    }
+    writer.flush()?;
+    Ok(())
+}
diff --git a/examples/unifiedlog_parser/Cargo.toml b/examples/unifiedlog_parser/Cargo.toml
index 0175efb..92e3c9c 100644
--- a/examples/unifiedlog_parser/Cargo.toml
+++ b/examples/unifiedlog_parser/Cargo.toml
@@ -11,4 +11,4 @@ csv = "1.3.0"
 chrono = "0.4.38"
 log = "0.4.22"
 macos-unifiedlogs = {path = "../../"}
-clap = {version = "4.5.15", features = ["derive"]}
\ No newline at end of file
+clap = {version = "4.5.18", features = ["derive"]}
\ No newline at end of file
diff --git a/examples/unifiedlog_parser/src/main.rs b/examples/unifiedlog_parser/src/main.rs
index b0d0a5c..8ce6e9e 100755
--- a/examples/unifiedlog_parser/src/main.rs
+++ b/examples/unifiedlog_parser/src/main.rs
@@ -311,13 +311,15 @@ fn parse_trace_file(
     }
 
     exclude_missing = false;
+    println!("Oversize cache size: {}", oversize_strings.oversize.len());
+    println!("Logs with missing oversize strings: {}", missing_data.len());
+    println!("Checking Oversize cache one more time...");
 
     // Since we have all Oversize entries now. Go through any log entries that we were not able to build before
     for mut leftover_data in missing_data {
         // Add all of our previous oversize data to logs for lookups
         leftover_data
-            .oversize
-            .append(&mut oversize_strings.oversize.to_owned());
+            .oversize = oversize_strings.oversize.clone();
 
         // Exclude_missing = false
         // If we fail to find any missing data its probably due to the logs rolling
diff --git a/src/iterator.rs b/src/iterator.rs
new file mode 100644
index 0000000..24a7df8
--- /dev/null
+++ b/src/iterator.rs
@@ -0,0 +1,261 @@
+use crate::{
+    catalog::CatalogChunk,
+    header::HeaderChunk,
+    preamble::LogPreamble,
+    unified_log::{LogData, UnifiedLogCatalogData, UnifiedLogData},
+    util::padding_size,
+};
+use log::{error, warn};
+use nom::bytes::complete::take;
+
+#[derive(Debug, Clone)]
+/// Iterator to loop through Chunks in the tracev3 file
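+///
+/// Minimal usage sketch (illustrative path; pair each yielded `UnifiedLogData`
+/// with `parser::build_log` to resolve message strings):
+/// ```ignore
+/// let log_iterator = UnifiedLogIterator {
+///     data: std::fs::read("Persist/0000000000000002.tracev3").unwrap(),
+///     header: Vec::new(),
+/// };
+/// for chunk in log_iterator {
+///     // Each chunk is a UnifiedLogData ready for build_log()
+/// }
+/// ```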
+pub struct UnifiedLogIterator {
+    pub data: Vec<u8>,
+    pub header: Vec<HeaderChunk>,
+}
+
+impl Iterator for UnifiedLogIterator {
+    type Item = UnifiedLogData;
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.data.is_empty() {
+            return None;
+        }
+        let mut unified_log_data_true = UnifiedLogData {
+            header: self.header.clone(),
+            catalog_data: Vec::new(),
+            oversize: Vec::new(),
+        };
+
+        let mut catalog_data = UnifiedLogCatalogData {
+            catalog: CatalogChunk {
+                chunk_tag: 0,
+                chunk_sub_tag: 0,
+                chunk_data_size: 0,
+                catalog_subsystem_strings_offset: 0,
+                catalog_process_info_entries_offset: 0,
+                number_process_information_entries: 0,
+                catalog_offset_sub_chunks: 0,
+                number_sub_chunks: 0,
+                unknown: Vec::new(),
+                earliest_firehose_timestamp: 0,
+                catalog_uuids: Vec::new(),
+                catalog_subsystem_strings: Vec::new(),
+                catalog_process_info_entries: Vec::new(),
+                catalog_subchunks: Vec::new(),
+            },
+            firehose: Vec::new(),
+            simpledump: Vec::new(),
+            statedump: Vec::new(),
+            oversize: Vec::new(),
+        };
+
+        let mut input = self.data.as_slice();
+        let chunk_preamble_size = 16; // Include preamble size in total chunk size
+
+        // Chunk tag values seen in tracev3 files
+        let header_chunk = 0x1000;
+        let catalog_chunk = 0x600b;
+        let chunkset_chunk = 0x600d;
+
+        loop {
+            let preamble_result = LogPreamble::detect_preamble(input);
+            let preamble = match preamble_result {
+                Ok((_, result)) => result,
+                Err(_err) => {
+                    error!("Failed to determine preamble chunk");
+                    return None;
+                }
+            };
+            let chunk_size = preamble.chunk_data_size;
+
+            // Grab all data associated with Unified Log entry (chunk)
+            let chunk_result = nom_bytes(input, &(chunk_size + chunk_preamble_size));
+
+            let (data, chunk_data) = match chunk_result {
+                Ok(result) => result,
+                Err(_err) => {
+                    error!("Failed to nom chunk bytes");
+                    return None;
+                }
+            };
+
+            if preamble.chunk_tag == header_chunk {
+                LogData::get_header_data(chunk_data, &mut unified_log_data_true);
+            } else if preamble.chunk_tag == catalog_chunk {
+                if catalog_data.catalog.chunk_tag != 0 {
+                    self.data = input.to_vec();
+                    break;
+                }
+
+                LogData::get_catalog_data(chunk_data, &mut catalog_data);
+            } else if preamble.chunk_tag == chunkset_chunk {
+                LogData::get_chunkset_data(
+                    chunk_data,
+                    &mut catalog_data,
+                    &mut unified_log_data_true,
+                );
+            } else {
+                error!(
+                    "[macos-unifiedlogs] Unknown chunk type: {}",
+                    preamble.chunk_tag
+                );
+            }
+
+            // Skip the padding that follows each chunk before reading the next preamble
+            let padding_size = padding_size(preamble.chunk_data_size);
+            if self.data.len() < padding_size as usize {
+                self.data = Vec::new();
+                break;
+            }
+            let data_result = nom_bytes(data, &padding_size);
+            let data = match data_result {
+                Ok((result, _)) => result,
+                Err(_err) => {
+                    error!("Failed to nom log end padding");
+                    return None;
+                }
+            };
+            if data.is_empty() {
+                self.data = Vec::new();
+                break;
+            }
+            input = data;
+            if input.len() < chunk_preamble_size as usize {
+                warn!(
+                    "Not enough data for preamble header, needed 16 bytes. Got: {}",
+                    input.len()
+                );
+                self.data = Vec::new();
+                break;
+            }
+        }
+
+        // Make sure to get the last catalog
+        if catalog_data.catalog.chunk_tag != 0 {
+            unified_log_data_true.catalog_data.push(catalog_data);
+        }
+        self.header = unified_log_data_true.header.clone();
+        Some(unified_log_data_true)
+    }
+}
+
+/// Nom bytes of the log chunk
+fn nom_bytes<'a>(data: &'a [u8], size: &u64) -> nom::IResult<&'a [u8], &'a [u8]> {
+    take(*size)(data)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::UnifiedLogIterator;
+    use crate::{
+        iterator::nom_bytes,
+        parser::{build_log, collect_shared_strings, collect_strings, collect_timesync},
+    };
+    use std::{fs, path::PathBuf};
+
+    #[test]
+    fn test_unified_log_iterator() {
+        let mut test_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+        test_path.push("tests/test_data/system_logs_big_sur.logarchive");
+
+        test_path.push("Persist/0000000000000002.tracev3");
+        let buffer_results = fs::read(test_path.to_str().unwrap()).unwrap();
+
+        let log_iterator = UnifiedLogIterator {
+            data: buffer_results,
+            header: Vec::new(),
+        };
+
+        let mut total = 0;
+        for chunk in log_iterator {
+            if chunk.catalog_data[0].firehose.len() == 99 {
+                assert_eq!(chunk.catalog_data[0].firehose.len(), 99);
+                assert_eq!(chunk.catalog_data[0].simpledump.len(), 0);
+                assert_eq!(chunk.header.len(), 1);
+                assert!(
+                    chunk.catalog_data[0]
+                        .catalog
+                        .catalog_process_info_entries
+                        .len()
+                        > 40
+                );
+                assert_eq!(chunk.catalog_data[0].statedump.len(), 0);
+            }
+
+            total += chunk.catalog_data.len();
+        }
+
+        assert_eq!(total, 56);
+    }
+
+    #[test]
+    fn test_unified_log_iterator_build_log() {
+        let mut test_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+        test_path.push("tests/test_data/system_logs_big_sur.logarchive");
+
+        let string_results = collect_strings(&test_path.display().to_string()).unwrap();
+
+        test_path.push("dsc");
+        let shared_strings_results =
+            collect_shared_strings(&test_path.display().to_string()).unwrap();
+        test_path.pop();
+
+        test_path.push("timesync");
+        let timesync_data = collect_timesync(&test_path.display().to_string()).unwrap();
+        test_path.pop();
+
+        test_path.push("Persist/0000000000000002.tracev3");
+        let buffer_results = fs::read(test_path.to_str().unwrap()).unwrap();
+
+        let log_iterator = UnifiedLogIterator {
+            data: buffer_results,
+            header: Vec::new(),
+        };
+
+        let mut total = 0;
+        for chunk in log_iterator {
+            let exclude_missing = false;
+            let (results, _) = build_log(
+                &chunk,
+                &string_results,
+                &shared_strings_results,
+                &timesync_data,
+                exclude_missing,
+            );
+
+            if results[10].time == 1642302327364384800.0 {
+                assert_eq!(results.len(), 3805);
+                assert_eq!(results[10].process, "/usr/libexec/lightsoutmanagementd");
+                assert_eq!(results[10].subsystem, "com.apple.lom");
+                assert_eq!(results[10].time, 1642302327364384800.0);
+                assert_eq!(results[10].activity_id, 0);
+                assert_eq!(
+                    results[10].library,
+                    "/System/Library/PrivateFrameworks/AppleLOM.framework/Versions/A/AppleLOM"
+                );
+                assert_eq!(results[10].message, "<private> LOM isSupported : No");
+                assert_eq!(results[10].pid, 45);
+                assert_eq!(results[10].thread_id, 588);
+                assert_eq!(results[10].category, "device");
+                assert_eq!(results[10].log_type, "Default");
+                assert_eq!(results[10].event_type, "Log");
+                assert_eq!(results[10].euid, 0);
+                assert_eq!(results[10].boot_uuid, "80D194AF56A34C54867449D2130D41BB");
+                assert_eq!(results[10].timezone_name, "Pacific");
+                assert_eq!(results[10].library_uuid, "D8E5AF1CAF4F3CEB8731E6F240E8EA7D");
+                assert_eq!(results[10].process_uuid, "6C3ADF991F033C1C96C4ADFAA12D8CED");
+                assert_eq!(results[10].raw_message, "%@ LOM isSupported : %s");
+            }
+
+            total += results.len();
+        }
+
+        assert_eq!(total, 207366);
+    }
+
+    #[test]
+    fn test_nom_bytes() {
+        let test = [1, 0, 0, 0];
+        let (left, _) = nom_bytes(&test, &1).unwrap();
+        assert_eq!(left.len(), 3);
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 5598c4c..9a4d9f5 100755
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -38,6 +38,7 @@ mod decoders;
 pub mod dsc;
 mod error;
 mod header;
+pub mod iterator;
 mod message;
 pub mod parser;
 mod preamble;
diff --git a/src/unified_log.rs b/src/unified_log.rs
index 985a971..c93892e 100755
--- a/src/unified_log.rs
+++ b/src/unified_log.rs
@@ -842,7 +842,7 @@ impl LogData {
     }
 
     /// Get the header of the Unified Log data (tracev3 file)
-    fn get_header_data(data: &[u8], unified_log_data: &mut UnifiedLogData) {
+    pub(crate) fn get_header_data(data: &[u8], unified_log_data: &mut UnifiedLogData) {
         let header_results = HeaderChunk::parse_header(data);
         match header_results {
             Ok((_, header_data)) => unified_log_data.header.push(header_data),
@@ -851,7 +851,7 @@ impl LogData {
     }
 
     /// Get the Catalog of the Unified Log data (tracev3 file)
-    fn get_catalog_data(data: &[u8], unified_log_data: &mut UnifiedLogCatalogData) {
+    pub(crate) fn get_catalog_data(data: &[u8], unified_log_data: &mut UnifiedLogCatalogData) {
         let catalog_results = CatalogChunk::parse_catalog(data);
         match catalog_results {
             Ok((_, catalog_data)) => unified_log_data.catalog = catalog_data,
@@ -863,7 +863,7 @@ impl LogData {
     }
 
     /// Get the Chunkset of the Unified Log data (tracev3)
-    fn get_chunkset_data(
+    pub(crate) fn get_chunkset_data(
         data: &[u8],
         catalog_data: &mut UnifiedLogCatalogData,
         unified_log_data: &mut UnifiedLogData,