Skip to content

Commit

Permalink
Merge pull request #560 from dcSpark/bence/add-page-numbers
Browse the repository at this point in the history
Add page numbers to text groups
  • Loading branch information
nicarq authored Sep 18, 2024
2 parents b362542 + 81562da commit 8a5d239
Show file tree
Hide file tree
Showing 7 changed files with 85 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,11 @@ impl ShinkaiFileParser {
(parsed_result.to_string(), serialized_metadata)
}

pub fn parse_and_split_into_text_groups(text: String, max_node_text_size: u64) -> Vec<TextGroup> {
pub fn parse_and_split_into_text_groups(
text: String,
max_node_text_size: u64,
page_number: Option<u32>,
) -> Vec<TextGroup> {
let mut text_groups = Vec::new();
let (parsed_text, metadata, parsed_any_metadata) = ShinkaiFileParser::parse_and_extract_metadata(&text);
let (parsed_md_text, md_metadata) = ShinkaiFileParser::parse_and_extract_md_metadata(&parsed_text);
Expand All @@ -367,11 +371,23 @@ impl ShinkaiFileParser {
let (parsed_chunk, metadata, _) = ShinkaiFileParser::parse_and_extract_metadata(&chunk);
let (parsed_md_chunk, md_metadata) = ShinkaiFileParser::parse_and_extract_md_metadata(&parsed_chunk);
let metadata = metadata.into_iter().chain(md_metadata).collect();
text_groups.push(TextGroup::new(parsed_md_chunk, metadata, vec![], None));
let mut text_group = TextGroup::new(parsed_md_chunk, metadata, vec![], None);

if let Some(page_number) = page_number {
text_group.push_page_number(page_number);
}

text_groups.push(text_group);
}
} else {
let metadata = metadata.into_iter().chain(md_metadata).collect();
text_groups.push(TextGroup::new(parsed_md_text, metadata, vec![], None));
let mut text_group = TextGroup::new(parsed_md_text, metadata, vec![], None);

if let Some(page_number) = page_number {
text_group.push_page_number(page_number);
}

text_groups.push(text_group);
}

text_groups
Expand All @@ -384,9 +400,11 @@ impl ShinkaiFileParser {
depth: usize,
text: String,
max_node_text_size: u64,
page_number: Option<u32>,
) {
if !text.is_empty() {
let created_text_groups = ShinkaiFileParser::parse_and_split_into_text_groups(text, max_node_text_size);
let created_text_groups =
ShinkaiFileParser::parse_and_split_into_text_groups(text, max_node_text_size, page_number);

if depth > 0 {
let mut parent_group = text_groups.last_mut();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,39 +75,44 @@ impl TextGroup {
}

if let Some(page_number) = page_number {
let mut unique_page_numbers: HashSet<u32> = HashSet::new();

if let Some(page_numbers_metadata) = self.metadata.get(&ShinkaiFileParser::page_numbers_metadata_key()) {
let page_numbers_metadata: Result<Vec<u32>, _> = page_numbers_metadata
.trim_matches(|c| c == '[' || c == ']')
.split(",")
.map(|n| n.trim().parse::<u32>())
.collect();

match page_numbers_metadata {
Ok(page_numbers) => {
for page_number in page_numbers {
unique_page_numbers.insert(page_number);
}
self.push_page_number(page_number);
}
}

/// Pushes a page number into this TextGroup
pub fn push_page_number(&mut self, page_number: u32) {
let mut unique_page_numbers: HashSet<u32> = HashSet::new();

if let Some(page_numbers_metadata) = self.metadata.get(&ShinkaiFileParser::page_numbers_metadata_key()) {
let page_numbers_metadata: Result<Vec<u32>, _> = page_numbers_metadata
.trim_matches(|c| c == '[' || c == ']')
.split(",")
.map(|n| n.trim().parse::<u32>())
.collect();

match page_numbers_metadata {
Ok(page_numbers) => {
for page_number in page_numbers {
unique_page_numbers.insert(page_number);
}
Err(_) => {}
}
Err(_) => {}
}

unique_page_numbers.insert(page_number);

self.metadata.insert(
ShinkaiFileParser::page_numbers_metadata_key(),
format!(
"[{}]",
unique_page_numbers
.iter()
.map(|n| n.to_string())
.collect::<Vec<String>>()
.join(", ")
),
);
}

unique_page_numbers.insert(page_number);

self.metadata.insert(
ShinkaiFileParser::page_numbers_metadata_key(),
format!(
"[{}]",
unique_page_numbers
.iter()
.map(|n| n.to_string())
.collect::<Vec<String>>()
.join(", ")
),
);
}

/// Pushes a sub-group into this TextGroup
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ impl LocalFileParser {
heading_depth,
current_text.clone(),
max_node_text_size,
None,
);
current_text = "".to_string();

Expand All @@ -108,6 +109,7 @@ impl LocalFileParser {
heading_depth,
text,
max_node_text_size,
None,
);
heading_depth += 1;
return;
Expand All @@ -127,6 +129,7 @@ impl LocalFileParser {
heading_depth,
current_text.clone(),
max_node_text_size,
None,
);
current_text = text;
}
Expand Down Expand Up @@ -156,6 +159,7 @@ impl LocalFileParser {
heading_depth,
current_text.clone(),
max_node_text_size,
None,
);
current_text = "".to_string();

Expand All @@ -165,12 +169,19 @@ impl LocalFileParser {
heading_depth,
table_text,
max_node_text_size,
None,
);
}
_ => {}
});

ShinkaiFileParser::push_text_group_by_depth(&mut text_groups, heading_depth, current_text, max_node_text_size);
ShinkaiFileParser::push_text_group_by_depth(
&mut text_groups,
heading_depth,
current_text,
max_node_text_size,
None,
);

Ok(text_groups)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ impl LocalFileParser {
heading_parents.len(),
node_text.trim().to_owned(),
max_node_text_size,
None,
);
node_text.clear();
}
Expand All @@ -131,6 +132,7 @@ impl LocalFileParser {
heading_parents.len(),
node_text.trim().to_owned(),
max_node_text_size,
None,
);
node_text.clear();

Expand Down Expand Up @@ -229,6 +231,7 @@ impl LocalFileParser {
heading_depth,
inner_text.trim().to_owned(),
max_node_text_size,
None,
);
}
"a" => {
Expand Down Expand Up @@ -284,6 +287,7 @@ impl LocalFileParser {
heading_parents.len(),
inner_text.trim().to_owned(),
max_node_text_size,
None,
);
}
"caption" => {
Expand Down Expand Up @@ -339,6 +343,7 @@ impl LocalFileParser {
heading_parents.len(),
result_text.trim().to_owned(),
max_node_text_size,
None,
);

Ok(text_groups)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ impl LocalFileParser {
heading_parents.len(),
current_text.clone(),
max_node_text_size,
None,
);
current_text = "".to_string();

Expand Down Expand Up @@ -104,6 +105,7 @@ impl LocalFileParser {
heading_depth,
text.to_string(),
max_node_text_size,
None,
);
}
NodeValue::Paragraph => {
Expand Down Expand Up @@ -169,6 +171,7 @@ impl LocalFileParser {
heading_parents.len(),
current_text.clone(),
max_node_text_size,
None,
);
current_text = "".to_string();
}
Expand All @@ -181,6 +184,7 @@ impl LocalFileParser {
heading_parents.len(),
current_text.clone(),
max_node_text_size,
None,
);

Ok(text_groups)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@ impl LocalFileParser {

for page in parsed_pages.into_iter() {
for pdf_text in page.content.into_iter() {
ShinkaiFileParser::push_text_group_by_depth(&mut text_groups, 0, pdf_text.text, max_node_text_size);
ShinkaiFileParser::push_text_group_by_depth(
&mut text_groups,
0,
pdf_text.text,
max_node_text_size,
Some(page.page_number.try_into().unwrap_or_default()),
);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ impl RetrievedNode {
if let Some(metadata) = &self.node.metadata {
if let Some(page_numbers) = metadata.get(&ShinkaiFileParser::page_numbers_metadata_key()) {
if !page_numbers.is_empty() {
return format!("Pgs: {}", page_numbers);
return format!("Pages: {}", page_numbers);
}
}
}
Expand Down

0 comments on commit 8a5d239

Please sign in to comment.