Storyyeller · abc4857 · Sep 6, 2017
diff --git a/README.md b/README.md
@@ -40,13 +40,11 @@ By default, Enjarify will refuse to overwrite the output file if it already exis
 
 ### Why not dex2jar?
 
-Dex2jar is an older tool that also tries to translate Dalvik to Java bytecode. It works reasonable well most of the time, but a lot of obscure features or edge cases will cause it to fail or even silently produce incorrect results. By contrast, Enjarify is designed to work in as many cases as possible, even for code where Dex2jar would fail. Among other things, Enjarify correctly handles unicode class names, constants used as multiple types, implicit casts, exception handlers jumping into normal control flow, classes that reference too many constants, very long methods, exception handlers after a catchall handler, and static initial values of the wrong type.
+Dex2jar is an older tool that also tries to translate Dalvik to Java bytecode. It works reasonably well most of the time, but a lot of obscure features or edge cases will cause it to fail or even silently produce incorrect results. By contrast, Enjarify is designed to work in as many cases as possible, even for code where Dex2jar would fail. Among other things, Enjarify correctly handles unicode class names, constants used as multiple types, implicit casts, exception handlers jumping into normal control flow, classes that reference too many constants, very long methods, exception handlers after a catchall handler, and static initial values of the wrong type. Enjarify can also translate optional metadata such as sourcefile attributes, line numbers, and annotations.
 
 
 ### Limitations
 
-Enjarify does not currently translate optional metadata such as sourcefile attributes, line numbers, and annotations.
-
 Enjarify tries hard to successfully translate as many classes as possible, but there are some potential cases where it is simply not possible due to limitations in Android, Java, or both. Luckily, this only happens in contrived circumstances, so it shouldn't be a problem in practice.
 
 

diff --git a/enjarify/byteio.py b/enjarify/byteio.py
@@ -54,6 +54,7 @@ def _leb128(self, signed=False):
 
     def uleb128(self): return self._leb128()
     def sleb128(self): return self._leb128(signed=True)
+    def uleb128p1(self): return self._leb128() - 1  # value is stored plus one, so an encoded 0 yields -1
 
     # Maintain strings in binary encoding instead of attempting to decode them
     # since the output will be using the same encoding anyway

diff --git a/enjarify/debug.py b/enjarify/debug.py
@@ -0,0 +1,91 @@
+# Copyright 2015 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Opcodes understood by DebugInstruction.
+DBG_END_SEQUENCE = 0x00
+DBG_ADVANCE_PC = 0x01
+DBG_ADVANCE_LINE = 0x02
+DBG_START_LOCAL = 0x03
+DBG_START_LOCAL_EXTENDED = 0x04
+DBG_END_LOCAL = 0x05
+DBG_RESTART_LOCAL = 0x06
+DBG_SET_PROLOGUE_END = 0x07
+DBG_SET_EPILOGUE_BEGIN = 0x08
+DBG_SET_FILE = 0x09
+DBG_FIRST_SPECIAL = 0x0A
+
+class DebugInstruction:
+    """One decoded instruction of a debug info bytecode sequence.
+
+    ``opcode`` is always set; the operand attributes are only set for the
+    opcodes that carry them. Opcodes without a branch in ``__init__``
+    (DBG_END_SEQUENCE, DBG_SET_PROLOGUE_END, DBG_SET_EPILOGUE_BEGIN and
+    anything from DBG_FIRST_SPECIAL up) read no operands.
+    """
+
+    def __init__(self, dex, stream):
+        """Read one opcode and its operands from stream.
+
+        dex resolves string and type indices. An index of -1, which
+        uleb128p1 produces for an encoded 0 ("no index"), resolves to None
+        rather than being handed to dex.
+        """
+        self.opcode = stream.u8()
+        if self.opcode == DBG_ADVANCE_PC:
+            self.addr_diff = stream.uleb128()
+        elif self.opcode == DBG_ADVANCE_LINE:
+            self.line_diff = stream.sleb128()
+        elif self.opcode == DBG_START_LOCAL:
+            self.register_num = stream.uleb128()
+            self.name_idx = stream.uleb128p1()
+            self.type_idx = stream.uleb128p1()
+
+            self.name = self._resolve(dex.string, self.name_idx)
+            self.type = self._resolve(dex.type, self.type_idx)
+        elif self.opcode == DBG_START_LOCAL_EXTENDED:
+            self.register_num = stream.uleb128()
+            self.name_idx = stream.uleb128p1()
+            self.type_idx = stream.uleb128p1()
+            self.sig_idx = stream.uleb128p1()
+
+            self.name = self._resolve(dex.string, self.name_idx)
+            self.type = self._resolve(dex.type, self.type_idx)
+            self.sig = self._resolve(dex.string, self.sig_idx)
+        elif self.opcode == DBG_END_LOCAL:
+            self.register_num = stream.uleb128()
+        elif self.opcode == DBG_RESTART_LOCAL:
+            self.register_num = stream.uleb128()
+        elif self.opcode == DBG_SET_FILE:
+            self.name_idx = stream.uleb128p1()
+
+            self.name = self._resolve(dex.string, self.name_idx)
+
+    @staticmethod
+    def _resolve(lookup, idx):
+        """Return lookup(idx), or None when idx is negative (no index)."""
+        return lookup(idx) if idx >= 0 else None
+
+def parseDebugInfo(dex, stream):
+    """Decode instructions from stream until DBG_END_SEQUENCE is read.
+
+    Returns the DebugInstruction objects in stream order; the terminating
+    DBG_END_SEQUENCE instruction is included as the last element.
+    """
+    ops = []
+    while True:
+        op = DebugInstruction(dex, stream)
+        ops.append(op)
+        if op.opcode == DBG_END_SEQUENCE:
+            break
+    return ops
diff --git a/enjarify/jvm/optimization/jumps.py b/enjarify/jvm/optimization/jumps.py
@@ -55,15 +55,24 @@ def optimizeJumps(irdata):
         assert ins.min <= ins.max
         ins.max = ins.min
 
-def createBytecode(irdata, opts):
+def createBytecode(irdata):
+    assert len(irdata.pos_map) == len(irdata.flat_instructions)
     instrs = irdata.flat_instructions
     posd, end_pos = _calcMinimumPositions(instrs)
+
+    pos_bytecode_map = [] # map each dalvik position -> ir -> to a bytecode offset
+    last_dalvik_pos = -1
 
     bytecode = bytearray()
-    for ins in instrs:
+    for ins, dalvik_pos in zip(instrs, irdata.pos_map):
         if isinstance(ins, (ir.LazyJumpBase, ir.Switch)):
             ins.calcBytecode(posd, irdata.labels)
         bytecode += ins.bytecode
+        if dalvik_pos != last_dalvik_pos:
+            pos_bytecode_map += [len(bytecode)] * (dalvik_pos - last_dalvik_pos)
+            last_dalvik_pos = dalvik_pos
+            #print(str(len(bytecode)))
+    assert len(pos_bytecode_map) == max(irdata.pos_map) + 1
     assert len(bytecode) == end_pos
 
 
@@ -97,4 +106,4 @@ def createBytecode(irdata, opts):
             print('Skipping zero width exception!')
             assert 0
 
-    return bytes(bytecode), packed_excepts
+    return bytes(bytecode), packed_excepts, pos_bytecode_map
diff --git a/enjarify/jvm/optimization/options.py b/enjarify/jvm/optimization/options.py
@@ -15,7 +15,7 @@
 class Options:
     def __init__(self, inline_consts=False, prune_store_loads=False,
         copy_propagation=False, remove_unused_regs=False, dup2ize=False,
-        sort_registers=False, split_pool=False, delay_consts=False):
+        sort_registers=False, split_pool=False, delay_consts=False, translate_debug=False):
         self.inline_consts = inline_consts
         self.prune_store_loads = prune_store_loads
         self.copy_propagation = copy_propagation
@@ -24,6 +24,7 @@ def __init__(self, inline_consts=False, prune_store_loads=False,
         self.sort_registers = sort_registers
         self.split_pool = split_pool
         self.delay_consts = delay_consts
+        self.translate_debug = translate_debug
 
 NONE = Options()
 # Options which make the generated code more readable for humans

diff --git a/enjarify/jvm/optimization/registers.py b/enjarify/jvm/optimization/registers.py
@@ -160,8 +160,8 @@ def removeUnusedRegisters(irdata):
 # For simplicity, parameter registers are preserved as is
 def simpleAllocateRegisters(irdata):
     instrs = irdata.flat_instructions
-    regmap = {v:i for i,v in enumerate(irdata.initial_args)}
-    nextreg = len(irdata.initial_args)
+    regmap = {v: i for i, v in enumerate(irdata.initial_args) if v is not None}
+    nextreg = len(irdata.initial_args)  # skipped None entries still occupy a slot index
 
     for instr in instrs:
         if isinstance(instr, ir.RegAccess):
@@ -172,6 +172,7 @@ def simpleAllocateRegisters(irdata):
                     nextreg += 1
             instr.calcBytecode(regmap[instr.key])
     irdata.numregs = nextreg
+    irdata.regmap = {k:v for k,v in regmap.items()} # keep track of original registers for debug info
 
 # Sort registers by number of uses so that more frequently used registers will
 # end up in slots 0-3 or 4-255 and benefit from the shorter instruction forms
@@ -229,3 +230,4 @@ def sortAllocateRegisters(irdata):
     for instr in instrs:
         if instr.bytecode is None and isinstance(instr, ir.RegAccess):
             instr.calcBytecode(regmap[instr.key])
+    irdata.regmap = regmap
diff --git a/enjarify/jvm/writebytecode.py b/enjarify/jvm/writebytecode.py
@@ -14,7 +14,8 @@
 
 from ..byteio import Writer
 from . import writeir, ir
-from .optimization import registers, jumps, stack, consts
+from .optimization import registers, jumps, stack, consts, options
+from .writedebug import writeDebugAttributes
 
 def getCodeIR(pool, method, opts):
     if method.code is not None:
@@ -77,7 +78,7 @@ def finishCodeAttrs(pool, code_irs, opts):
 def writeCodeAttributeTail(pool, irdata, opts):
     method = irdata.method
     jumps.optimizeJumps(irdata)
-    bytecode, excepts = jumps.createBytecode(irdata, opts)
+    bytecode, excepts, pos_map = jumps.createBytecode(irdata)
 
     stream = Writer()
     # For simplicity, don't bother calculating the actual maximum stack height
@@ -95,5 +96,10 @@ def writeCodeAttributeTail(pool, irdata, opts):
     stream.write(b''.join(excepts))
 
     # attributes
-    stream.u16(0)
+    if opts.translate_debug and method.code.debug_info != None:
+        attr_count, attrs = writeDebugAttributes(pool, irdata, pos_map, len(bytecode), irdata.regmap)
+        stream.u16(attr_count)
+        stream.write(attrs)
+    else:
+        stream.u16(0)
     return stream
diff --git a/enjarify/jvm/writeclass.py b/enjarify/jvm/writeclass.py
@@ -44,19 +44,33 @@ def writeField(pool, stream, field):
     else:
         stream.u16(0) # no attributes
 
-def writeMethod(pool, stream, method, code_attr_data):
+def writeMethod(pool, stream, method, code_attr_data, opts):
+    """Write a method_info entry: flags, name, descriptor, then attributes."""
     stream.u16(method.access & flags.METHOD_FLAGS)
     stream.u16(pool.utf8(method.id.name))
     stream.u16(pool.utf8(method.id.desc))
 
+    # Parameter names are only looked up when debug translation is enabled
+    param_names = None
+    if opts.translate_debug and method.code is not None and method.code.debug_info is not None:
+        param_names = method.code.debug_info.parameter_names
+
+    # attributes_count: "Code" and/or "MethodParameters"
+    stream.u16(int(code_attr_data is not None) + int(bool(param_names)))
     if code_attr_data is not None:
         code_attr_data = code_attr_data.toBytes()
-        stream.u16(1)
         stream.u16(pool.utf8(b"Code"))
         stream.u32(len(code_attr_data))
         stream.write(code_attr_data)
-    else:
-        stream.u16(0) # no attributes
+    if param_names:
+        stream.u16(pool.utf8(b"MethodParameters"))
+        # body: u8 count, then u16 name index + u16 access flags per parameter
+        stream.u32(1 + len(param_names) * 4)
+        stream.u8(len(param_names))
+        for name in param_names:
+            # index 0 is written for a parameter whose name is None
+            stream.u16(pool.utf8(name) if name is not None else 0)
+            stream.u16(0) # dex doesn't have access_flags
 
 def writeMethods(pool, stream, methods, opts):
     code_irs = []
@@ -66,7 +80,7 @@ def writeMethods(pool, stream, methods, opts):
 
     stream.u16(len(methods))
     for method in methods:
-        writeMethod(pool, stream, method, code_attrs.get(method))
+        writeMethod(pool, stream, method, code_attrs.get(method), opts)
 
 def classFileAfterPool(cls, opts):
     stream = Writer()
@@ -115,8 +129,15 @@ def toClassFile(cls, opts):
     try:
         pool, rest_stream = classFileAfterPool(cls, opts=opts)
     except error.ClassfileLimitExceeded:
-        # print('Retrying {} with optimization enabled'.format(cls.name))
-        pool, rest_stream = classFileAfterPool(cls, opts=options.ALL)
+        print('Retrying {} with optimization enabled'.format(cls.name))
+        # options.ALL is shared module state: adjust a copy so the debug
+        # overrides below do not leak into every later use of options.ALL.
+        import copy
+        newopts = copy.copy(options.ALL)
+        if opts.translate_debug:
+            newopts.translate_debug = True
+            newopts.sort_registers = False # breaks translate debug for some reason
+        pool, rest_stream = classFileAfterPool(cls, opts=newopts)
 
     # write constant pool
     pool.write(stream)