Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Code object compression via bundling #1374

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions tensilelite/Tensile/BuildCommands/AssemblyCommands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import collections
import math
import os
import shutil
import subprocess

from pathlib import Path
from typing import List, Union

from .. import Utils
from ..TensileInstructions import getGfxName
from ..Common import globalParameters, print2, ensurePath, printWarning
from ..KernelWriterAssembly import KernelWriterAssembly
from .SharedCommands import compressCodeObject

def _linkIntoCodeObject(
    objFiles: List[str], coPathDest: Union[Path, str], kernelWriterAssembly: KernelWriterAssembly
):
    """Links object files into a code object file.

    On Windows the object file list is passed through a response file. On
    other platforms the files are handed to the link command obtained from
    ``kernelWriterAssembly``; if there are more than 10000 of them they are
    first combined, in batches, into intermediate relocatable objects named
    ``<coPathDest>.<batch index>``.

    Args:
        objFiles: A list of object files to be linked.
        coPathDest: The destination path for the code object file.
        kernelWriterAssembly: An instance of KernelWriterAssembly to get link arguments.

    Raises:
        subprocess.CalledProcessError: If a linker invocation exits with a non-zero status.
    """
    if os.name == "nt":
        # Use args file on Windows b/c the command may exceed the limit of 8191 characters.
        # The response file is placed next to the destination and referenced by
        # its full path so the invocation does not depend on the working directory.
        argsFile = Path(coPathDest).parent / "clangArgs.txt"
        with open(argsFile, 'wt') as file:
            file.write(" ".join(objFiles))
        args = [
            globalParameters['AssemblerPath'],
            '-target', 'amdgcn-amd-amdhsa',
            '-o', str(coPathDest),
            '@' + str(argsFile),
        ]
        subprocess.check_call(args)
        return

    maxObjFiles = 10000

    if len(objFiles) > maxObjFiles:
        batches = [objFiles[i:i + maxObjFiles] for i in range(0, len(objFiles), maxObjFiles)]
        mergedObjFiles = []

        for index, batch in enumerate(batches):
            if len(batch) > 1:
                mergedPath = f"{coPathDest}.{index}"
                args = [globalParameters["ROCmLdPath"], "-r"] + batch + ["-o", mergedPath]
                print2(f"Linking object files into fewer object files: {' '.join(args)}")
                subprocess.check_call(args)
                mergedObjFiles.append(mergedPath)
            else:
                # A batch holding a single file needs no relinking; forward that
                # file itself so the final link receives a flat list of paths.
                mergedObjFiles.append(batch[0])

        objFiles = mergedObjFiles

    args = kernelWriterAssembly.getLinkCodeObjectArgs(objFiles, str(coPathDest))
    print2(f"Linking object files into code object: {' '.join(args)}")
    subprocess.check_call(args)



def buildAssemblyCodeObjectFiles(kernels, kernelWriterAssembly, outputPath):
    """Produces the code object files for all assembly kernels.

    Kernels whose ``KernelLanguage`` is ``"Assembly"`` are grouped by ISA. When
    ``MergeFiles`` is set, ``NumMergedFiles`` is greater than one, or
    ``LazyLibraryLoading`` is set, the kernels' object files are linked into raw
    code objects in the assembly directory and then compressed into
    ``<outputPath>/library``. Otherwise each kernel's ``.co`` file is copied
    from the assembly directory without compression.

    Args:
        kernels: Kernel descriptions; each is indexed by ``"KernelLanguage"`` and ``"ISA"``.
        kernelWriterAssembly: Provides the assembly directory, kernel file base names and link arguments.
        outputPath: The output directory; code objects are placed in its ``library`` subdirectory.

    Returns:
        List of resulting code object paths (``Path`` objects for merged
        builds, strings for copied per-kernel files).
    """
    objSuffix = ".o"
    coSuffix = ".co"
    rawCoSuffix = ".co.raw"

    destDir = Path(ensurePath(os.path.join(outputPath, 'library')))
    asmDir = Path(kernelWriterAssembly.getAssemblyDirectory())

    def objectPathOf(kernel):
        return str(asmDir / (kernelWriterAssembly.getKernelFileBase(kernel) + objSuffix))

    # Group the assembly kernels by their ISA.
    kernelsByArch = collections.defaultdict(list)
    for kernel in kernels:
        if kernel["KernelLanguage"] == "Assembly":
            kernelsByArch[tuple(kernel['ISA'])].append(kernel)

    coFiles = []
    for arch, archKernels in kernelsByArch.items():
        if not archKernels:
            continue

        gfx = getGfxName(arch)

        mergeMode = (
            globalParameters["MergeFiles"]
            or globalParameters["NumMergedFiles"] > 1
            or globalParameters["LazyLibraryLoading"]
        )

        if mergeMode:
            # Kernels without an explicit code object go into the per-arch library.
            defaultObjects = [objectPathOf(k) for k in archKernels if 'codeObjectFile' not in k]

            rawCoToObjects = collections.defaultdict(list)
            if defaultObjects:
                rawCoToObjects[asmDir / ("TensileLibrary_" + gfx + rawCoSuffix)] = defaultObjects

            # Kernels naming a code object are grouped under that name.
            for kernel in archKernels:
                coName = kernel.get("codeObjectFile", None)
                if not coName:
                    continue
                rawCoToObjects[asmDir / (coName + rawCoSuffix)].append(objectPathOf(kernel))

            for coFileRaw, objFiles in rawCoToObjects.items():
                _linkIntoCodeObject(objFiles, coFileRaw, kernelWriterAssembly)
                coFile = destDir / coFileRaw.name.replace(rawCoSuffix, coSuffix)
                compressCodeObject(coFileRaw, coFile, gfx, globalParameters["ClangOffloadBundlerPath"])
                coFiles.append(coFile)
        else:
            # no mergefiles
            def copiedCoPath(kName):
                if globalParameters["PackageLibrary"]:
                    return os.path.join(destDir, gfx, kName + '.co')
                return os.path.join(destDir, kName + '_' + gfx + '.co')

            kernelNames = (kernelWriterAssembly.getKernelFileBase(k) for k in archKernels)
            copyJobs = (
                (os.path.join(asmDir, kName + '.co'), copiedCoPath(kName))
                for kName in kernelNames
            )
            for src, dst in Utils.tqdm(copyJobs, "Copying code objects"):
                shutil.copyfile(src, dst)
                coFiles.append(dst)
            printWarning("Code object files are not compressed in `--no-merge-files` build mode.")

    return coFiles
40 changes: 40 additions & 0 deletions tensilelite/Tensile/BuildCommands/SharedCommands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import subprocess

from typing import Union
from pathlib import Path

from ..Common import print2

def compressCodeObject(
    coPathSrc: Union[Path, str], coPathDest: Union[Path, str], gfx: str, bundler: str
):
    """Bundles a code object file into a compressed code object file.

    Args:
        coPathSrc: Path of the code object file to compress.
        coPathDest: Path where the compressed code object file is written.
        gfx: The target GPU architecture, used in the device target triple.
        bundler: Path to the Clang Offload Bundler executable.

    Raises:
        RuntimeError: If the bundler exits with a non-zero status.
    """
    targets = f"host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--{gfx}"
    cmd = [bundler, "--compress", "--type=o", "--bundle-align=4096"]
    cmd.append("--targets=" + targets)
    # Two inputs are supplied, one per entry in --targets: /dev/null first,
    # then the source code object.
    cmd.append("--input=/dev/null")
    cmd.append("--input=" + str(coPathSrc))
    cmd.append("--output=" + str(coPathDest))

    commandLine = ' '.join(cmd)
    print2(f"Bundling/compressing code objects: {commandLine}")
    try:
        out = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as err:
        raise RuntimeError(
            f"Error compressing code object via bundling: {err.output}\nFailed command: {commandLine}"
        )
    print2(f"Output: {out}")
199 changes: 199 additions & 0 deletions tensilelite/Tensile/BuildCommands/SourceCommands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
import itertools
import os
import re
import shlex
import shutil
import subprocess
from pathlib import Path
from typing import Iterable, List, Union

from ..Common import globalParameters, print2, ensurePath, supportedCompiler, ParallelMap2, splitArchs, which
from .SharedCommands import compressCodeObject

def _compileSourceObjectFile(cmdlineArchs: List[str], cxxCompiler: str, cxxSrcPath: str, objDestPath: str, outputPath: str):
    """Compiles a source file into an object file.

    Args:
        cmdlineArchs: Architectures, each passed to the compiler as ``--offload-arch=<arch>``.
        cxxCompiler: The C++ compiler to use; resolved through ``which``.
        cxxSrcPath: The path to the source file to compile.
        objDestPath: The path of the object file to produce.
        outputPath: Directory added to the include search path via ``-I``.

    Raises:
        RuntimeError: If the compilation command fails.
    """
    offloadArchFlags = ['--offload-arch=' + arch for arch in cmdlineArchs]

    #TODO(@jichangjichang) Needs to be fixed when Maneesh's change is made available
    flags = ["-D__HIP_HCC_COMPAT_MODE__=1"]
    if cxxCompiler == "hipcc":
        flags.append("--genco")
    else:
        flags += ["--cuda-device-only", "-x", "hip", "-O3"]

    flags += ['-I', outputPath]
    flags += ["-Xoffload-linker", "--build-id=" + str(globalParameters["BuildIdKind"])]
    flags.append('-std=c++17')
    if globalParameters["AsanBuild"]:
        flags += ["-fsanitize=address", "-shared-libasan", "-fuse-ld=lld"]
    if globalParameters["SaveTemps"]:
        flags.append('--save-temps')

    # Optional launcher command taken from the environment and put in front of the compiler.
    launcherCmd = shlex.split(os.environ.get('Tensile_CXX_COMPILER_LAUNCHER', ''))

    if os.name == "nt":
        flags += ['-fms-extensions', '-fms-compatibility', '-fPIC', '-Wno-deprecated-declarations']

    cmd = [*launcherCmd, which(cxxCompiler), *flags, *offloadArchFlags, cxxSrcPath, '-c', '-o', objDestPath]

    try:
        out = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
        print2(f"Output: {out}" if out else "")
    except subprocess.CalledProcessError as err:
        raise RuntimeError(f"Error compiling source object file: {err.output}\nFailed command: {' '.join(cmd)}")


def _listTargetTriples(bundler: str, objFile: str) -> List[str]:
"""Lists the target triples in an object file.

Args:
bundler: The path to the bundler, typically ``clang-offload-bundler``.
objFile: The object file path.

Returns:
List of target triples in the object file.
"""
args = [bundler, "--type=o", f"--input={objFile}", "-list"]
try:
listing = subprocess.check_output(args, stderr=subprocess.STDOUT).decode().split("\n")
except subprocess.CalledProcessError as err:
raise RuntimeError(f"Error listing target triples in object files: {err.output}\nFailed command: {' '.join(args)}")
return listing


def _computeSourceCodeObjectFilename(target: str, base: str, buildPath: Union[Path, str], arch: str) -> Path:
"""Generates a code object file path using the target, base, and build path.

Args:
target: The target triple.
base: The base name for the output file (name without extension).
buildPath: The build directory path.

Returns:
Path to the code object file.
"""
coPath = None
buildPath = Path(buildPath)
if "TensileLibrary" in base and "fallback" in base:
coPath = buildPath / "{0}_{1}.hsaco.raw".format(base, arch)
elif "TensileLibrary" in base:
variant = [t for t in ["", "xnack-", "xnack+"] if t in target][-1]
baseVariant = base + "-" + variant if variant else base
if arch in baseVariant:
coPath = buildPath / (baseVariant + ".hsaco.raw")
else:
raise RuntimeError(
"Failed to compute code object name:"
f"Could not find variant {variant} in base {baseVariant}"
)
else:
coPath= buildPath / "{0}.so-000-{1}.hsaco.raw".format(base, arch)

return coPath


def _unbundleSourceCodeObjects(bundler: str, target: str, infile: str, outfileRaw: str):
    """Extracts the code object for one target from a bundled object file.

    Args:
        bundler: The path to the bundler, typically ``clang-offload-bundler``.
        target: The target string selecting which entry to extract.
        infile: The bundled input file path.
        outfileRaw: The path where the extracted raw code object is written.

    Raises:
        RuntimeError: If unbundling the source code object file fails.
    """
    cmd = [bundler, "--type=o"]
    cmd += ["--targets=" + target, "--input=" + infile, "--output=" + outfileRaw]
    cmd.append("--unbundle")

    commandLine = " ".join(cmd)
    print2("Unbundling source code object file: " + commandLine)
    try:
        out = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
        print2(f"Output: {out}" if out else "")
    except subprocess.CalledProcessError as err:
        raise RuntimeError(f"Error unbundling source code object file: {err.output}\nFailed command: {commandLine}")


def _buildSourceCodeObjectFile(cxxCompiler: str, outputPath: Union[Path, str], kernelPath: Union[Path, str]) -> List[str]:
    """Compiles a HIP source code file into code object files.

    The source is compiled into a bundled object file in the
    ``code_object_tmp`` directory under the working path. The code object of
    every listed target containing ``gfx`` is then unbundled there and, once
    all are extracted, moved into ``<outputPath>/library``.

    Args:
        cxxCompiler: The C++ compiler to use.
        outputPath: The output directory path where code objects will be placed.
        kernelPath: The path to the kernel source file.

    Returns:
        List of paths to the created code objects.

    Raises:
        RuntimeError: If the compiler is not supported or no bundler is configured.
    """
    buildPath = Path(ensurePath(os.path.join(globalParameters['WorkingPath'], 'code_object_tmp')))
    destPath = Path(ensurePath(os.path.join(outputPath, 'library')))
    kernelPath = Path(kernelPath)

    # Export the configured CMake compiler, if any, through the environment.
    if "CmakeCxxCompiler" in globalParameters:
        cmakeCxxCompiler = globalParameters["CmakeCxxCompiler"]
        if cmakeCxxCompiler is not None:
            os.environ["CMAKE_CXX_COMPILER"] = cmakeCxxCompiler

    if not supportedCompiler(cxxCompiler):
        raise RuntimeError(f"Unknown compiler {cxxCompiler}")

    _, cmdlineArchs = splitArchs()

    objPath = str(buildPath / (kernelPath.stem + '.o'))
    _compileSourceObjectFile(cmdlineArchs, cxxCompiler, str(kernelPath), objPath, str(outputPath))

    bundler = globalParameters["ClangOffloadBundlerPath"]
    if not bundler:
        raise RuntimeError("No bundler found; set TENSILE_ROCM_OFFLOAD_BUNDLER_PATH to point to clang-offload-bundler")

    # Unbundle everything first; files are moved only after all targets succeeded.
    pendingMoves = []
    for target in _listTargetTriples(bundler, objPath):
        match = re.search("gfx.*$", target)
        if not match:
            continue
        arch = match.group().replace(":", "-")
        coPathRaw = _computeSourceCodeObjectFilename(target, kernelPath.stem, buildPath, arch)
        _unbundleSourceCodeObjects(bundler, target, objPath, str(coPathRaw))
        # .stem drops the trailing ".raw" from the temporary file name.
        pendingMoves.append((coPathRaw, str(destPath / coPathRaw.stem)))

    for rawPath, finalPath in pendingMoves:
        shutil.move(rawPath, finalPath)

    return [finalPath for _, finalPath in pendingMoves]

def buildSourceCodeObjectFiles(cxxCompiler: str, kernelFiles: List[Path], outputPath: Path) -> Iterable[str]:
    """Compiles HIP source code files into code object files.

    Each kernel file is handed to ``_buildSourceCodeObjectFile`` through
    ``ParallelMap2``; the per-file results are flattened into one iterable.

    Args:
        cxxCompiler: The C++ compiler to use.
        kernelFiles: List of paths to the kernel source files.
        outputPath: The output directory path where code objects will be placed.

    Returns:
        Iterable over the paths of all created code objects.
    """
    jobs = ((cxxCompiler, outputPath, kernelFile) for kernelFile in kernelFiles)
    perFileCoPaths = ParallelMap2(_buildSourceCodeObjectFile, jobs, "Compiling source kernels")
    return itertools.chain.from_iterable(perFileCoPaths)
Empty file.
Loading