Working on linux without GUI #14

Open

aokocax opened this issue Dec 7, 2023 · 0 comments

aokocax commented Dec 7, 2023

Hello, I updated the maxperf file for systems that run on a CLI only. It does not save the files to disk at the moment. While checking whether there was any problem, I noticed that it created only 6 images in the genImage function, even though a full batch of prompts goes in. Sorry for the code, I am actually a C# developer :)
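As a quick way to see the mismatch, a one-line check inside genImage (right after the genit call) would do; this assumes the pipeline is expected to return one PIL image per prompt, and the assert is hypothetical, not part of the script:

```python
# Hypothetical sanity check for genImage: the pipeline should return
# one image per prompt, so any shortfall is reported immediately.
assert len(images) == batchSize, f"expected {batchSize} images, got {len(images)}"
```

The updated file: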

```python
import sys
import os
import time
import random

import numpy as np

import torch
from diffusers import AutoPipelineForText2Image, AutoencoderTiny
from sfast.compilers.stable_diffusion_pipeline_compiler import (compile, CompilationConfig)

torch.set_grad_enabled(False)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

mw = None  # only referenced by the disabled debug print in dwencode
batchSize = 10
prompts = ['Evil space kitty', 'Cute dog in hat, H.R. Giger style', 'Horse wearing a tie', 'Cartoon pig', 'Donkey on Mars', 'Cute kitties baked in a cake', 'Boxing chickens on farm, Maxfield Parish style', 'Future spaceship', 'A city of the past', 'Jabba the Hut wearing jewelery',
'istanbul photo scenery', 'a nice girl with hat','a dog playing footbal','an umbrella and raining',' paper cut plane flying on a desk','a cup coffee and child toys','space ship on a lake','a knife and a fork on a table','futuristic microphone','an apple, a banana, a melon']

def dwencode(pipe, prompts, batchSize: int, nTokens: int):
    tokenizer = pipe.tokenizer
    text_encoder = pipe.text_encoder

    if nTokens < 0 or nTokens > 75:
        raise ValueError("n random tokens must be between 0 and 75")

    if nTokens > 0:
        randIIs = torch.randint(low=0, high=49405, size=(batchSize, nTokens), device='cuda')

    text_inputs = tokenizer(
        prompts,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    ).to('cuda')

    tii = text_inputs.input_ids

    # Find the end-of-text marker, which determines the prompt length (pl)
    # in terms of user tokens.
    #pl = np.where(tii[0] == 49407)[0][0] - 1
    pl = (tii[0] == torch.tensor(49407, device='cuda')).nonzero()[0][0].item() - 1

    if nTokens > 0:
        # TODO: Efficiency
        for i in range(batchSize):
            tii[i][1+pl:1+pl+nTokens] = randIIs[i]
            tii[i][1+pl+nTokens] = 49407

    if False:  # debug: print the decoded token stream for each batch item
        for bi in range(batchSize):
            print(f"{mw.seqno:05d}-{bi:02d}: ", end='')
            for tid in tii[bi][1:1+pl+nTokens]:
                print(f"{tokenizer.decode(tid)} ", end='')
            print('')

    prompt_embeds = text_encoder(tii.to('cuda'), attention_mask=None)
    prompt_embeds = prompt_embeds[0]
    prompt_embeds = prompt_embeds.to(dtype=pipe.unet.dtype, device='cuda')

    bs_embed, seq_len, _ = prompt_embeds.shape
    prompt_embeds = prompt_embeds.repeat(1, 1, 1)
    prompt_embeds = prompt_embeds.view(bs_embed * 1, seq_len, -1)

    return prompt_embeds

pipe = AutoPipelineForText2Image.from_pretrained("stabilityai/sd-turbo", torch_dtype=torch.float16, variant="fp16")
pipe.to("cuda")
#pipe.unet.to(memory_format=torch.channels_last)

pipe.vae = AutoencoderTiny.from_pretrained('madebyollin/taesd', torch_dtype=torch.float16)
pipe.vae = pipe.vae.cuda()

pipe.set_progress_bar_config(disable=True)

if True:
    config = CompilationConfig.Default()

    # xformers and Triton are suggested for achieving best performance.
    # It might be slow for Triton to generate, compile and fine-tune kernels.
    try:
        import xformers
        config.enable_xformers = True
    except ImportError:
        print('xformers not installed, skip')
    # NOTE:
    # When GPU VRAM is insufficient or the architecture is too old, Triton might be slow.
    # Disable Triton if you encounter this problem.
    try:
        import triton
        config.enable_triton = True
    except ImportError:
        print('Triton not installed, skip')
    # NOTE:
    # CUDA Graph is suggested for small batch sizes and small resolutions to reduce CPU overhead.
    # My implementation can handle dynamic shape with increased need for GPU memory.
    # But when your GPU VRAM is insufficient or the image resolution is high,
    # CUDA Graph could cause less efficient VRAM utilization and slow down the inference,
    # especially when on Windows or WSL which has the "shared VRAM" mechanism.
    # If you meet problems related to it, you should disable it.
    config.enable_cuda_graph = True

    if True:
        config.enable_jit = True
        config.enable_jit_freeze = True
        config.trace_scheduler = True
        config.enable_cnn_optimization = True
        config.preserve_parameters = False
        config.prefer_lowp_gemm = True

    pipe = compile(pipe, config)




def genImage(output_dir, seqno, prompts, batchSize):
    global pipe
    seed = random.randint(0, 2147483647)
    torch.manual_seed(seed)

    images = genit(0, prompts=prompts, batchSize=batchSize, nSteps=1)
    for idx, img in enumerate(images):
        img_path = os.path.join(output_dir, f'image_{seqno}_{idx}.png')
        #img.save(img_path)
        print(img_path)
    return len(images)


def genit(mode, prompts, batchSize, nSteps):
    #tm0 = time.time()
    pe = dwencode(pipe, prompts, batchSize, 9)
    images = pipe(
        prompt_embeds=pe,
        width=512, height=512,
        num_inference_steps=nSteps,
        guidance_scale=1,
        output_type="pil",
        return_dict=False
    )[0]
    #print(f"time = {(1000*(time.time() - tm0)):3.1f} milliseconds")

    return images

if __name__ == '__main__':
    output_dir = 'spew'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    seqno = 0
    if len(sys.argv) == 2:
        batchSize = int(sys.argv[1])
        if batchSize > 20:
            print('Batchsize must not be greater than 20.')
            sys.exit(1)
        prompts = prompts[:batchSize]
    else:
        batchSize = 20

    start_time = time.time()
    counter = 0

    while True:
        seqno += 1
        counter += 1
        genImage(output_dir, seqno, prompts, batchSize)
        current_time = time.time()
        if current_time - start_time >= 1:
            # counter counts batches, so images/sec is counter * batchSize
            print(f"{counter} iterations in the last second.")
            start_time = current_time
            counter = 0

```
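On the saving side, here is a minimal sketch of one way to write the files without stalling the generation loop, assuming PIL images as returned above (the save_images_async helper and the worker count are illustrative, not part of the script):

```python
from concurrent.futures import ThreadPoolExecutor
import os

# Illustrative helper: offload the PNG writes to a small thread pool so
# disk I/O does not block the CUDA generation loop.
_saver = ThreadPoolExecutor(max_workers=2)

def save_images_async(images, output_dir, seqno):
    for idx, img in enumerate(images):
        img_path = os.path.join(output_dir, f'image_{seqno}_{idx}.png')
        # PIL's Image.save runs on a worker thread, off the hot path
        _saver.submit(img.save, img_path)
```

In genImage this would take the place of the commented-out img.save(img_path) call.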

@aokocax changed the title from "Linux on without GUI" to "Working on linux without GUI" on Dec 7, 2023