From e4bc36fa166f79b114dbbd8152b4ba743297c67e Mon Sep 17 00:00:00 2001 From: cccs-jh <63320703+cccs-jh@users.noreply.github.com> Date: Thu, 2 Feb 2023 18:04:10 -0500 Subject: [PATCH 1/2] Fixing charcodes - removing charcode functionality since it mangles files - todo: add decimal charcode decoding to charcode function (currently empty) - renamed xml_unescape to charcode_xml to reflect similarity with charcode methods - refactored charcode methods to use regex.sub instead of regex.findall + bytes.replace - separated unicode charcodes from hexadecimal charcodes - simplified charcode methods by enforcing correct charcode lengths and better regexes - made charcode_xml more flexible to reflect what xml and html parsers allow. --- deobs.py | 110 ++++++++++++++++++++++--------------------------------- 1 file changed, 43 insertions(+), 67 deletions(-) diff --git a/deobs.py b/deobs.py index 34e45d4..cae865a 100644 --- a/deobs.py +++ b/deobs.py @@ -7,6 +7,7 @@ import os from collections import Counter +from functools import partial from itertools import chain from typing import Callable, Dict, List, Optional, Set, Tuple @@ -42,69 +43,55 @@ def printable_ratio(self, text: bytes) -> float: """ Calcuate the ratio of printable characters to total characters in text """ return float(float(len(text.translate(None, self.BINCHARS))) / float(len(text))) + @staticmethod + def encode_codepoint(codepoint: int) -> bytes: + """ Returns the utf-8 encoding of a unicode codepoint """ + return chr(codepoint).encode('utf-8') + + @staticmethod + def codepoint_sub(match: regex.Match, base: int = 16) -> bytes: + """ Replace method for unicode codepoint regex substitutions. + + Args: + match: The regex match object with the text of the unicode codepoint value as group 1. + base: The base that the unicode codepoint is represented in (defaults to hexadecimal) + Returns: + - The utf-8 byte sequence for the codepoint if it can be decoded. + - The original match text if there is a decoding error. 
+ """ + try: + return DeobfuScripter.encode_codepoint(int(match.group(1), base)) + except ValueError: + return match.group(0) # No replacement if decoding fails + @staticmethod def add1b(s: bytes, k: int) -> bytes: """ Add k to each byte of s """ return bytes([(c + k) & 0xff for c in s]) - def charcode(self, text: bytes) -> Optional[bytes]: + @staticmethod + def charcode(text: bytes) -> Optional[bytes]: """ Replace character codes with the corresponding characters """ - arrayofints = list(filter(lambda n: n < 256, - map(int, regex.findall(r'(\d+)', str(regex.findall(rb'\D{1,2}\d{2,3}', text)))))) - if len(arrayofints) > 20: - output = bytes(arrayofints) - if self.printable_ratio(output) > .75 and (float(len(output)) / float(len(text))) > .10: - # if the output is mostly readable and big enough - return output - - return None + # To do: what decimal encodings exist in scripting languages and how to decode them? @staticmethod def charcode_hex(text: bytes) -> Optional[bytes]: """ Replace hex character codes with the corresponding characters """ - output = text - enc_str = [b'\\u', b'%u', b'\\x', b'0x'] - - for encoding in enc_str: - char_len = [(16, regex.compile(rb'(?:' + regex.escape(encoding) + b'[A-Fa-f0-9]{16}){2,}')), - (8, regex.compile(rb'(?:' + regex.escape(encoding) + b'[A-Fa-f0-9]{8}){2,}')), - (4, regex.compile(rb'(?:' + regex.escape(encoding) + b'[A-Fa-f0-9]{4}){2,}')), - (2, regex.compile(rb'(?:' + regex.escape(encoding) + b'[A-Fa-f0-9]{2}){2,}'))] - - for r in char_len: - hexchars = set(regex.findall(r[1], text)) - - for hex_char in hexchars: - data = hex_char - decoded = b'' - if r[0] == 2: - while data != b'': - decoded += binascii.a2b_hex(data[2:4]) - data = data[4:] - if r[0] == 4: - while data != b'': - decoded += binascii.a2b_hex(data[4:6]) + binascii.a2b_hex(data[2:4]) - data = data[6:] - if r[0] == 8: - while data != b'': - decoded += binascii.a2b_hex(data[8:10]) + binascii.a2b_hex(data[6:8]) + \ - binascii.a2b_hex(data[4:6]) + 
binascii.a2b_hex(data[2:4]) - data = data[10:] - if r[0] == 16: - while data != b'': - decoded += binascii.a2b_hex(data[16:18]) + binascii.a2b_hex(data[14:16]) + \ - binascii.a2b_hex(data[12:14]) + binascii.a2b_hex(data[10:12]) + \ - binascii.a2b_hex(data[8:10]) + binascii.a2b_hex(data[6:8]) + \ - binascii.a2b_hex(data[4:6]) + binascii.a2b_hex(data[2:4]) - data = data[18:] - - # Remove trailing NULL bytes - final_dec = regex.sub(b'[\x00]*$', b'', decoded) - output = output.replace(hex_char, final_dec) + output = regex.sub(rb'(?i)(?:\\x|0x|%)([a-f0-9]{2})', lambda m: binascii.unhexlify(m.group(1)), text) + return output if output != text else None - if output == text: - return None - return output + @staticmethod + def charcode_unicode(text: bytes) -> Optional[bytes]: + """ Replace unicode character codes with the corresponding utf-8 byte sequence""" + output = regex.sub(rb'(?i)(?:\\u|%u)([a-z0-9]{4})', DeobfuScripter.codepoint_sub, text) + return output if output != text else None + + @staticmethod + def charcode_xml(text: bytes) -> Optional[bytes]: + """ Replace XML escape sequences with the corresponding character """ + output = regex.sub(rb'(?i)&#x([a-z0-9]{1,6});', DeobfuScripter.codepoint_sub, text) + output = regex.sub(rb'&#([0-9]{1,7});', partial(DeobfuScripter.codepoint_sub, base=10), output) + return output if output != text else None @staticmethod def chr_decode(text: bytes) -> Optional[bytes]: @@ -120,16 +107,6 @@ def chr_decode(text: bytes) -> Optional[bytes]: return None return output - @staticmethod - def xml_unescape(text: bytes) -> Optional[bytes]: - """ Replace XML escape sequences with the corresponding character """ - output = text - for hex in regex.findall(rb'(?i)&#x[a-z0-9]{2};', text): - output = output.replace(hex, binascii.unhexlify(hex[3:-1])) - for escape in regex.findall(rb'&#(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2});', text): - output = output.replace(escape, int(escape[2:-1]).to_bytes(1, 'big')) - return output if output != text else 
None - @staticmethod def string_replace(text: bytes) -> Optional[bytes]: """ Replace calls to replace() with their output """ @@ -497,13 +474,12 @@ def execute(self, request: ServiceRequest) -> None: ('Concat strings', self.concat_strings), ('MSWord macro vars', self.mswordmacro_vars), ('Powershell vars', self.powershell_vars), - ('Charcode hex', self.charcode_hex), - ('XML unescape', self.xml_unescape) + ('Hex Charcodes', self.charcode_hex), + ('Unicode Charcodes', self.charcode_unicode), + ('XML Charcodes', self.charcode_xml) ] second_pass.extend(first_pass) - final_pass: TechniqueList = [ - ('Charcode', self.charcode), - ] + final_pass: TechniqueList = [] code_extracts = [ ('.*html.*', "HTML scripts extraction", self.extract_htmlscript) From 3f7c2cc81f0353151c717805d33836e6448d60f9 Mon Sep 17 00:00:00 2001 From: cccs-jh <63320703+cccs-jh@users.noreply.github.com> Date: Thu, 2 Feb 2023 18:13:59 -0500 Subject: [PATCH 2/2] Using f-string instead of .format --- deobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deobs.py b/deobs.py index cae865a..8644257 100644 --- a/deobs.py +++ b/deobs.py @@ -100,7 +100,7 @@ def chr_decode(text: bytes) -> Optional[bytes]: for fullc, c in regex.findall(rb'(chr[bw]?\(([0-9]{1,3})\))', output, regex.I): # noinspection PyBroadException try: - output = regex.sub(regex.escape(fullc), '"{}"'.format(chr(int(c))).encode('utf-8'), output) + output = regex.sub(regex.escape(fullc), f'"{chr(int(c))}"'.encode('utf-8'), output) except Exception: continue if output == text: