From 5acecb3a52ab43f9cbf2b63367d1bb12fa99c7a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96mer=20Sinan=20A=C4=9Facan?= <omeragacan@gmail.com>
Date: Fri, 24 Sep 2021 10:53:38 +0300
Subject: [PATCH] Fix unicode handling

Fixes some of the issues reported in #5, but not all
---
 src/main.rs             | 40 +++++++++++++++++++++++++++-------------
 src/tests.rs            | 19 +++++++++++++++++++
 test_files/issue_5_1.ml |  6 ++++++
 3 files changed, 52 insertions(+), 13 deletions(-)
 create mode 100644 test_files/issue_5_1.ml

diff --git a/src/main.rs b/src/main.rs
index bc24782..ba5f8ad 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -268,23 +268,35 @@ fn walk_ast<W: Write>(
     }
 }
 
-fn get_token_line_col(token: &str, column0: usize, idx: usize) -> (usize, usize) {
-    let mut chars = token.chars();
+fn get_token_line_col(token: &str, column0: usize, mut byte_idx: usize) -> (usize, usize, usize) {
+    let mut chars = token.chars().peekable();
 
     let mut line = 0;
     let mut col = column0;
-
-    for _ in 0..idx {
-        let c = chars.next();
-        if c == Some('\n') {
+    let mut col_byte_idx = 0;
+
+    while byte_idx != 0 {
+        let c = chars.next().unwrap();
+        byte_idx -= c.len_utf8();
+        if c == '\r' {
+            if let Some('\n') = chars.peek() {
+                let _ = chars.next(); // consume '\n'
+                byte_idx -= '\n'.len_utf8();
+            }
+            line += 1;
+            col = 0;
+            col_byte_idx = 0;
+        } else if c == '\n' {
             line += 1;
             col = 0;
+            col_byte_idx = 0;
         } else {
             col += 1;
+            col_byte_idx += c.len_utf8();
         }
     }
 
-    (line, col)
+    (line, col, col_byte_idx)
 }
 
 fn check_word_bounds(text: &str, match_begin: usize, match_end: usize) -> bool {
@@ -303,6 +315,7 @@ fn check_word_bounds(text: &str, match_begin: usize, match_end: usize) -> bool {
     true
 }
 
+/// Returns byte indices of matches of `pattern` in `token`
 fn match_token(
     token: &str,
     pattern: &str,
@@ -343,6 +356,7 @@ fn match_token(
         .collect()
 }
 
+// NB. `match_byte_idx` is a byte index to `token_str`
 fn report_match<W: Write>(
     stdout: &mut W,
     cfg: &Cfg,
@@ -350,16 +364,16 @@ fn report_match<W: Write>(
     node: &Node,
     token_str: &str,
     lines: &[&str],
-    match_: usize,
+    match_byte_idx: usize,
     header_printed: &mut bool,
     first: &mut bool,
 ) {
     let pos = node.start_position();
 
-    let (token_line, token_col) = get_token_line_col(token_str.as_ref(), pos.column, match_);
+    let (token_line, column, column_byte) =
+        get_token_line_col(token_str, pos.column, match_byte_idx);
 
     let line = pos.row + token_line;
-    let column = token_col;
 
     // Print header (if grouping)
     if !*header_printed && cfg.group {
@@ -429,9 +443,9 @@ fn report_match<W: Write>(
         }
     };
 
-    let before_match = &line[0..column];
-    let match_ = &line[column..column + cfg.pattern.len()];
-    let after_match = &line[column + cfg.pattern.len()..];
+    let before_match = &line[0..column_byte];
+    let match_ = &line[column_byte..column_byte + cfg.pattern.len()];
+    let after_match = &line[column_byte + cfg.pattern.len()..];
     let _ = write!(stdout, "{}", before_match);
     if cfg.color {
         let _ = write!(
diff --git a/src/tests.rs b/src/tests.rs
index 713a3b4..78ed7e4 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -81,3 +81,22 @@ fn simple_word_id() {
          1:fn test() {\n"
     );
 }
+
+#[test]
+fn issue_5_1() {
+    let str = run_args(&[
+        "sg",
+        "--ocaml",
+        "-k",
+        "identifier",
+        "1",
+        "test_files/issue_5_1.ml",
+        "--nocolor",
+    ]);
+
+    assert_eq!(
+        str,
+        "test_files/issue_5_1.ml\n\
+         4:1\n"
+    );
+}
diff --git a/test_files/issue_5_1.ml b/test_files/issue_5_1.ml
new file mode 100644
index 0000000..c2df7ff
--- /dev/null
+++ b/test_files/issue_5_1.ml
@@ -0,0 +1,6 @@
+(*
+¹
+¹
+1
+
+*)