api.py
import typing

from pydantic import BaseModel

MAX_IMAGE_SIZE = (768, 768)


class PipelineInfo(BaseModel):
    upload_urls: typing.List[str] = []
    model_id: str
    scheduler: typing.Optional[str] = None
    seed: int = 42
    disable_safety_checker: bool = False


class DiffusersInputs(BaseModel):
    prompt: typing.List[str]
    negative_prompt: typing.Optional[typing.List[str]] = None
    num_images_per_prompt: int = 1
    num_inference_steps: int = 50
    guidance_scale: float = 7.5
class Text2ImgInputs(DiffusersInputs):
    width: int
    height: int


class Img2ImgInputs(DiffusersInputs):
    image: typing.List[str]
    strength: float


class InpaintInputs(DiffusersInputs):
    image: typing.List[str]
    mask_image: typing.List[str]


class UpscaleInputs(DiffusersInputs):
    image: typing.List[str]


class InstructPix2PixInputs(DiffusersInputs):
    image: typing.List[str]
    image_guidance_scale: float


class ControlNetPipelineInfo(PipelineInfo):
    controlnet_model_id: typing.Union[str, typing.List[str]]
    disable_preprocessing: bool = False


class ControlNetInputs(DiffusersInputs):
    image: typing.List[str]
    controlnet_conditioning_scale: typing.Union[float, typing.List[float]] = 1.0
    eta: float = 0
    guess_mode: bool = False


class ControlNetImg2ImgInputs(ControlNetInputs):
    control_image: typing.List[str]


class ControlNetInpaintInputs(ControlNetInputs):
    mask_image: typing.List[str]
class AudioLDMInputs(BaseModel):
    prompt: typing.List[str]
    negative_prompt: typing.Optional[typing.List[str]] = None
    num_waveforms_per_prompt: int = 1
    num_inference_steps: int = 10
    guidance_scale: float = 2.5
    audio_length_in_s: float = 5.12


class VQAInput(BaseModel):
    image: typing.List[str]
    question: typing.List[str]

    # https://github.com/salesforce/LAVIS/blob/7aa83e93003dade66f7f7eaba253b10c459b012d/lavis/models/blip_models/blip_vqa.py#L162
    num_beams: int = 3
    inference_method: str = "generate"
    max_len: int = 10
    min_len: int = 1
    num_ans_candidates: int = 128
class ImageCaptioningInput(BaseModel):
    image: typing.List[str]

    # https://github.com/salesforce/LAVIS/blob/7aa83e93003dade66f7f7eaba253b10c459b012d/lavis/models/blip_models/blip_caption.py#L136
    num_beams: int = 3
    max_length: int = 30
    min_length: int = 10
    repetition_penalty: float = 1.0
    num_captions: int = 1
class WhisperInputs(BaseModel):
    audio: str
    task: typing.Literal["translate", "transcribe"] = "transcribe"
    language: typing.Optional[str] = None
    return_timestamps: bool = False
    decoder_kwargs: typing.Optional[dict] = None
    chunk_length_s: float = 30
    stride_length_s: typing.Tuple[float, float] = (6, 0)
    batch_size: int = 16


class MMSInputs(BaseModel):
    audio: str
    language: str
    return_timestamps: bool = False
    chunk_length_s: float = 30
    stride_length_s: typing.Tuple[float, float] = (6, 0)
    batch_size: int = 8


class NemoASRInputs(BaseModel):
    audio: str


class AsrOutputChunk(BaseModel):
    timestamp: typing.Tuple[float, float]
    text: str


class AsrOutput(BaseModel):
    text: str
    chunks: typing.List[AsrOutputChunk] = []
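

# Illustrative usage sketch (not part of the original module): shows how these
# request/response models might be populated and validated. Assumes pydantic
# v1-style `.dict()` serialization; the service wiring that actually consumes
# these models is not shown here.
if __name__ == "__main__":
    text2img = Text2ImgInputs(
        prompt=["a photo of an astronaut riding a horse"],
        negative_prompt=["low quality"],
        num_inference_steps=25,
        width=MAX_IMAGE_SIZE[0],
        height=MAX_IMAGE_SIZE[1],
    )
    print(text2img.dict())

    asr = AsrOutput(
        text="hello world",
        chunks=[AsrOutputChunk(timestamp=(0.0, 1.2), text="hello world")],
    )
    print(asr.dict())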