WIP: Add function for detecting BOM

Even though BOM is rarely used these days, I find it useful to be able detect it sometimes. I added new function that detects all the BOMs listed in Wikipedia. Currently UTF-32LE doesn't work for some reason. I'll investigate why.
peelonet · Oct 11, 2024 · 07e10aa · 07e10aa
1 parent 4b6841b
commit 07e10aa
Show file tree

Hide file tree

Showing 4 changed files with 354 additions and 1 deletion.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -2,7 +2,7 @@ CMAKE_MINIMUM_REQUIRED(VERSION 3.6)
 
 PROJECT(
   PeeloUnicode
-  VERSION 1.0.0
+  VERSION 1.1.0
   DESCRIPTION "Header only C++ Unicode utilities."
   HOMEPAGE_URL "https://github.com/peelonet/peelo-unicode"
   LANGUAGES CXX

diff --git a/README.md b/README.md
@@ -112,3 +112,64 @@ main()
   }
 }
 ```
+
+## BOM detection
+
+The library provides function for detecting whether an byte string contains
+[byte order mark] or not, and which character encoding it is. Even though use
+of BOM is rare these days, it might sometimes be useful to able to detect it.
+
+List of detected character encodings are:
+
+- [UTF-8]
+- [UTF-16BE][UTF-16]
+- [UTF-16LE][UTF-16]
+- [UTF-32BE][UTF-32]
+- [UTF-32LE][UTF-32]
+- [UTF-7]
+- [UTF-1]
+- [UTF-EBCDIC]
+- [SCSU]
+- [BOCU-1]
+- [GB18030]
+
+[Byte order mark]: https://en.wikipedia.org/wiki/Byte_order_mark
+[UTF-7]: https://en.wikipedia.org/wiki/UTF-7
+[UTF-1]: https://en.wikipedia.org/wiki/UTF-1
+[UTF-EBCDIC]: https://en.wikipedia.org/wiki/UTF-EBCDIC
+[SCSU]: https://en.wikipedia.org/wiki/Standard_Compression_Scheme_for_Unicode
+[BOCU-1]: https://en.wikipedia.org/wiki/Binary_Ordered_Compression_for_Unicode
+[GB18030]: https://en.wikipedia.org/wiki/GB_18030
+
+### Example
+
+```cpp
+#include <fstream>
+#include <iostream>
+#include <peelo/unicode/bom.hpp>
+
+int
+main()
+{
+  char buffer[1024];
+  std::fstream f("file.txt");
+  std::size_t length;
+  peelo::unicode::bom type;
+
+  f.read(buffer, sizeof(buffer));
+  length = f.gcount();
+  f.close();
+
+  if (peelo::unicode::detect_bom(buffer, length, &type))
+  {
+    if (type == peelo::unicode::bom::utf16_be)
+    {
+      std::cout << "File has UTF-16BE BOM." << std::endl;
+    } else {
+      std::cout << "File has some other BOM." << std::endl;
+    }
+  } else {
+    std::cout << "File does not contain BOM." << std::endl;
+  }
+}
+```
diff --git a/include/peelo/unicode/bom.hpp b/include/peelo/unicode/bom.hpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2018-2024, peelo.net
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+#include <cstring>
+#include <string>
+#include <vector>
+
+namespace peelo::unicode
+{
+  /**
+   * Enumeration of different recognized BOM types.
+   */
+  enum class bom
+  {
+    utf8,
+    utf16_be,
+    utf16_le,
+    utf32_be,
+    utf32_le,
+    utf7,
+    utf1,
+    utf_ebcdic,
+    scsu,
+    bocu_1,
+    gb18030,
+  };
+
+  /**
+   * Tests whether given byte string contains byte order mark or not.
+   *
+   * @param input Byte string to test.
+   * @param length Length of the given byte string.
+   * @param type Pointer to the BOM type. If given, it will be set to the
+   *             detected BOM type.
+   * @return True if given byte string contains byte order mark or not,
+   *         false otherwise.
+   */
+  inline bool
+  detect_bom(const char* input, std::size_t length, bom* type = nullptr)
+  {
+    struct bom_info
+    {
+      const char* bytes;
+      std::size_t length;
+      bom type;
+    };
+    static const std::vector<bom_info> bom_list =
+    {
+      {
+        "\xef\xbb\xbf",
+        3,
+        bom::utf8,
+      },
+      {
+        "\xfe\xff",
+        2,
+        bom::utf16_be,
+      },
+      {
+        "\xff\xfe",
+        2,
+        bom::utf16_le,
+      },
+      {
+        "\x00\x00\xfe\xff",
+        4,
+        bom::utf32_be,
+      },
+      // FIXME: For some reason test cases fail with this one.
+      {
+        "\xff\xfe\x00\x00",
+        4,
+        bom::utf32_le,
+      },
+      {
+        "\x2b\x2f\x76",
+        3,
+        bom::utf7,
+      },
+      {
+        "\xf7\x64\x4c",
+        3,
+        bom::utf1,
+      },
+      {
+        "\xdd\x73\x66\x73",
+        4,
+        bom::utf_ebcdic
+      },
+      {
+        "\x0e\xfe\xff",
+        3,
+        bom::scsu
+      },
+      {
+        "\xfb\xee\x28",
+        3,
+        bom::bocu_1
+      },
+      {
+        "\x84\x31\x95\x33",
+        4,
+        bom::gb18030
+      },
+    };
+    const auto size = bom_list.size();
+
+    for (std::size_t i = 0; i < size; ++i)
+    {
+      const auto& info = bom_list[i];
+
+      if (length < info.length)
+      {
+        continue;
+      }
+      else if (!std::memcmp(input, info.bytes, info.length))
+      {
+        if (type)
+        {
+          *type = info.type;
+        }
+
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * Tests whether given string contains byte order mark or not.
+   *
+   * @param input String to test.
+   * @param type Pointer to the BOM type. If given, it will be set to the
+   *             detected BOM type.
+   * @return True if byte string contains byte order mark or not, false
+   *         otherwise.
+   */
+  inline bool
+  detect_bom(const std::string& input, bom* type = nullptr)
+  {
+    return detect_bom(input.c_str(), input.length(), type);
+  }
+}
diff --git a/test/test_bom.cpp b/test/test_bom.cpp
@@ -0,0 +1,123 @@
+#include <cassert>
+
+#include <peelo/unicode/bom.hpp>
+
+using peelo::unicode::detect_bom;
+using peelo::unicode::bom;
+
+static void
+test_recognized_bom(
+  bom expected_type,
+  const char* input,
+  std::size_t length
+)
+{
+  bom type;
+
+  assert(detect_bom(input, length, &type));
+  assert(type == expected_type);
+}
+
+static void
+test_utf8()
+{
+  test_recognized_bom(bom::utf8, "\xef\xbb\xbf", 3);
+}
+
+static void
+test_utf16_be()
+{
+  test_recognized_bom(bom::utf16_be, "\xfe\xff", 2);
+}
+
+static void
+test_utf16_le()
+{
+  test_recognized_bom(bom::utf16_le, "\xff\xfe", 2);
+}
+
+static void
+test_utf32_be()
+{
+  test_recognized_bom(bom::utf32_be, "\x00\x00\xfe\xff", 4);
+}
+
+#if 0
+static void
+test_utf32_le()
+{
+  const char input[] = { '\xff', '\xfe', '\x00', '\x00' };
+
+  test_recognized_bom(bom::utf32_le, input, 4);
+}
+#endif
+
+static void
+test_utf7()
+{
+  test_recognized_bom(bom::utf7, "\x2b\x2f\x76", 3);
+}
+
+static void
+test_utf1()
+{
+  test_recognized_bom(bom::utf1, "\xf7\x64\x4c", 3);
+}
+
+static void
+test_utf_ebcdic()
+{
+  test_recognized_bom(bom::utf_ebcdic, "\xdd\x73\x66\x73", 4);
+}
+
+static void
+test_scsu()
+{
+  test_recognized_bom(bom::scsu, "\x0e\xfe\xff", 3);
+}
+
+static void
+test_bocu_1()
+{
+  test_recognized_bom(bom::bocu_1, "\xfb\xee\x28", 3);
+}
+
+static void
+test_gb18030()
+{
+  test_recognized_bom(bom::gb18030, "\x84\x31\x95\x33", 4);
+}
+
+static void
+test_unrecognized_bom()
+{
+  assert(!detect_bom("", 0));
+  assert(!detect_bom("a", 1));
+  assert(!detect_bom("a\xef\xbb\xbf", 4));
+  assert(!detect_bom("\x00\xbb\xbf\xef\xbb\xbf", 6));
+}
+
+static void
+test_with_string()
+{
+  assert(detect_bom(std::string("\xef\xbb\xbf")));
+  assert(!detect_bom(std::string("a")));
+}
+
+int
+main()
+{
+  test_utf8();
+  test_utf16_be();
+  test_utf16_le();
+  test_utf32_be();
+  // test_utf32_le();
+  test_utf7();
+  test_utf1();
+  test_utf_ebcdic();
+  test_scsu();
+  test_bocu_1();
+  test_gb18030();
+  test_unrecognized_bom();
+  test_with_string();
+}