From 491a5c15ca162a52f4e49024a473a0d1ee271465 Mon Sep 17 00:00:00 2001
From: ylfeng
Date: Sat, 8 Jun 2024 12:39:50 +0800
Subject: [PATCH] Add start, end to ner and srl results
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 python/core/ltp_core/algorithms/__init__.py   |   1 -
 .../datamodules/adapters/postagger.py         |   4 +-
 .../datamodules/adapters/segmention.py        |   4 +-
 .../adapters/semantic_dependency_parsing.py   |   4 +-
 .../ltp_core/datamodules/components/bio.py    |   8 +-
 .../ltp_core/datamodules/components/conllu.py |  48 +++---
 .../ltp_core/datamodules/components/srl.py    |  25 ++-
 .../datamodules/multi_task_datamodule.py      |  12 +-
 .../ltp_core/datamodules/utils/collate.py     |   6 +-
 .../ltp_core/datamodules/utils/datasets.py    |   6 +-
 .../datamodules/utils/multitask_dataloader.py |   4 +-
 .../datamodules/utils/vocab_helper.py         |   4 +-
 .../core/ltp_core/models/components/graph.py  |   4 +-
 .../core/ltp_core/models/criterion/token.py   |  19 +--
 python/core/ltp_core/models/lit_model.py      |  10 +-
 python/core/ltp_core/models/metrics/token.py  |   6 +-
 python/core/ltp_core/models/nn/crf.py         |  17 +-
 .../models/nn/relative_transformer.py         |  36 +---
 .../ltp_core/models/optimization/layer_lrs.py |   4 +-
 .../core/ltp_core/models/utils/instantiate.py |  12 +-
 python/core/ltp_core/utils/utils.py           |   9 +-
 python/core/tests/test_crf.py                 |  54 +++---
 python/extension/Cargo.toml                   |   2 +-
 python/extension/utils/stub.py                |  18 +-
 python/interface/examples/issues.py           |  11 +-
 python/interface/examples/server.py           | 161 ++++++++++++++++++
 python/interface/ltp/generic.py               |   4 +-
 python/interface/ltp/interface.py             |   7 -
 python/interface/ltp/legacy.py                |  15 +-
 python/interface/ltp/module.py                |   4 +-
 python/interface/ltp/nerual.py                |  30 +---
 rust/ltp/Cargo.toml                           |   2 +-
 32 files changed, 280 insertions(+), 271 deletions(-)
 create mode 100644 python/interface/examples/server.py

diff --git a/python/core/ltp_core/algorithms/__init__.py b/python/core/ltp_core/algorithms/__init__.py
index 26eddde9..6480b288 100644
--- a/python/core/ltp_core/algorithms/__init__.py
+++ b/python/core/ltp_core/algorithms/__init__.py
@@ -3,7 +3,6 @@ from ltp_extension.algorithms import eisner as rust_eisner
 
 
 def eisner(scores, mask, remove_root=False):
-
     scores = scores.view(-1).cpu().numpy()
     length = torch.sum(mask, dim=1).cpu().numpy()
 
diff --git a/python/core/ltp_core/datamodules/adapters/postagger.py b/python/core/ltp_core/datamodules/adapters/postagger.py
index 68d63633..e698cfa5 100644
--- a/python/core/ltp_core/datamodules/adapters/postagger.py
+++ b/python/core/ltp_core/datamodules/adapters/postagger.py
@@ -48,9 +48,7 @@ def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs):
     os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
     dataset = load_dataset(Conllu, data_dir=data_dir, cache_dir=data_dir)
-    dataset = dataset.remove_columns(
-        ["id", "lemma", "upos", "feats", "head", "deprel", "deps", "misc"]
-    )
+    dataset = dataset.remove_columns(["id", "lemma", "upos", "feats", "head", "deprel", "deps", "misc"])
     dataset = dataset.rename_column("xpos", "labels")
     dataset = dataset.map(lambda examples: tokenize(examples, tokenizer, max_length), batched=True)
     dataset = dataset.filter(lambda x: not x["overflow"])
diff --git a/python/core/ltp_core/datamodules/adapters/segmention.py b/python/core/ltp_core/datamodules/adapters/segmention.py
index d0138508..88c286ad 100644
--- a/python/core/ltp_core/datamodules/adapters/segmention.py
+++ b/python/core/ltp_core/datamodules/adapters/segmention.py
@@ -67,9 +67,7 @@ def build_dataset(data_dir, task_name, tokenizer, max_length=512, mode="bmes", * os.environ["TOKENIZERS_PARALLELISM"] = "true" dataset = load_dataset(Conllu, data_dir=data_dir, cache_dir=data_dir) - dataset = dataset.remove_columns( - ["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc"] - ) + dataset = dataset.remove_columns(["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc"]) if mode == "bmes": dataset = dataset.map( lambda examples: tokenize(examples, tokenizer, max_length, length2bmes), diff --git a/python/core/ltp_core/datamodules/adapters/semantic_dependency_parsing.py b/python/core/ltp_core/datamodules/adapters/semantic_dependency_parsing.py index 57853049..97621aa1 100644 --- a/python/core/ltp_core/datamodules/adapters/semantic_dependency_parsing.py +++ b/python/core/ltp_core/datamodules/adapters/semantic_dependency_parsing.py @@ -51,9 +51,7 @@ def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs): os.environ["TOKENIZERS_PARALLELISM"] = "true" dataset = load_dataset(Conllu, data_dir=data_dir, cache_dir=data_dir) - dataset = dataset.remove_columns( - ["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "misc"] - ) + dataset = dataset.remove_columns(["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "misc"]) dataset = dataset.map(lambda examples: tokenize(examples, tokenizer, max_length), batched=True) dataset = dataset.filter(lambda x: not x["overflow"]) dataset.set_format( diff --git a/python/core/ltp_core/datamodules/components/bio.py b/python/core/ltp_core/datamodules/components/bio.py index 5da37a1f..4bb894bf 100644 --- a/python/core/ltp_core/datamodules/components/bio.py +++ b/python/core/ltp_core/datamodules/components/bio.py @@ -87,17 +87,13 @@ def _info(self): def _split_generators(self, dl_manager): """We handle string, list and dicts in datafiles.""" if not self.config.data_files: - raise ValueError( - f"At least one data file must be specified, but got data_files={self.config.data_files}" - ) + raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") data_files = dl_manager.download_and_extract(self.config.data_files) if isinstance(data_files, (str, list, tuple)): files = data_files if isinstance(files, str): files = [files] - return [ - datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files}) - ] + return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files})] splits = [] for split_name, files in data_files.items(): if isinstance(files, str): diff --git a/python/core/ltp_core/datamodules/components/conllu.py b/python/core/ltp_core/datamodules/components/conllu.py index 1c63f08f..07073591 100644 --- a/python/core/ltp_core/datamodules/components/conllu.py +++ b/python/core/ltp_core/datamodules/components/conllu.py @@ -66,10 +66,7 @@ def build_vocabs(data_dir, *files, min_freq=5): counter.update(itertools.chain(*values[row])) elif "deps" == name: try: - deps = [ - [label.split(":", maxsplit=1)[1] for label in dep.split("|")] - for dep in values[row] - ] + deps = [[label.split(":", maxsplit=1)[1] for label in dep.split("|")] for dep in values[row]] counter.update(itertools.chain(*deps)) except Exception: counter.update("_") @@ -166,17 +163,13 @@ def _info(self): def _split_generators(self, dl_manager): """We handle string, list and dicts in datafiles.""" if not self.config.data_files: - raise ValueError( - f"At least one data file must be specified, but got 
data_files={self.config.data_files}" - ) + raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") data_files = dl_manager.download_and_extract(self.config.data_files) if isinstance(data_files, (str, list, tuple)): files = data_files if isinstance(files, str): files = [files] - return [ - datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files}) - ] + return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files})] splits = [] for split_name, files in data_files.items(): if isinstance(files, str): @@ -189,13 +182,9 @@ def _generate_examples(self, files): logging.info("⏳ Generating examples from = %s", filename) for line_num, block in iter_blocks(filename=filename): # last example - id, words, lemma, upos, xpos, feats, head, deprel, deps, misc = ( - list(value) for value in zip(*block) - ) + id, words, lemma, upos, xpos, feats, head, deprel, deps, misc = (list(value) for value in zip(*block)) if self.config.deps: - deps = [ - [label.split(":", maxsplit=1) for label in dep.split("|")] for dep in deps - ] + deps = [[label.split(":", maxsplit=1) for label in dep.split("|")] for dep in deps] deps = [ [{"id": depid, "head": int(label[0]), "rel": label[-1]} for label in dep] for depid, dep in enumerate(deps) @@ -204,18 +193,21 @@ def _generate_examples(self, files): if any([dep["head"] >= len(words) for dep in deps]): continue - yield line_num, { - "id": id, - "form": words, - "lemma": lemma, - "upos": upos, - "xpos": xpos, - "feats": feats, - "head": head, - "deprel": deprel, - "deps": deps, - "misc": misc, - } + yield ( + line_num, + { + "id": id, + "form": words, + "lemma": lemma, + "upos": upos, + "xpos": xpos, + "feats": feats, + "head": head, + "deprel": deprel, + "deps": deps, + "misc": misc, + }, + ) def main(): diff --git a/python/core/ltp_core/datamodules/components/srl.py b/python/core/ltp_core/datamodules/components/srl.py index 4c3efff2..2ef41efb 100644 --- a/python/core/ltp_core/datamodules/components/srl.py +++ b/python/core/ltp_core/datamodules/components/srl.py @@ -107,9 +107,7 @@ def _info(self): { "form": datasets.Sequence(datasets.Value("string")), "predicate": datasets.Sequence(create_feature(feats["predicate"])), - "arguments": datasets.Sequence( - datasets.Sequence(create_feature(feats["arguments"])) - ), + "arguments": datasets.Sequence(datasets.Sequence(create_feature(feats["arguments"]))), } ), supervised_keys=None, @@ -118,17 +116,13 @@ def _info(self): def _split_generators(self, dl_manager): """We handle string, list and dicts in datafiles.""" if not self.config.data_files: - raise ValueError( - f"At least one data file must be specified, but got data_files={self.config.data_files}" - ) + raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") data_files = dl_manager.download_and_extract(self.config.data_files) if isinstance(data_files, (str, list, tuple)): files = data_files if isinstance(files, str): files = [files] - return [ - datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files}) - ] + return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files})] splits = [] for split_name, files in data_files.items(): if isinstance(files, str): @@ -143,11 +137,14 @@ def _generate_examples(self, files): # last example words, predicate, *roles = (list(value) for value in zip(*block)) - yield line_num, { - "form": words, - "predicate": predicate, - "arguments": roles, - } + yield ( + 
line_num, + { + "form": words, + "predicate": predicate, + "arguments": roles, + }, + ) def main(): diff --git a/python/core/ltp_core/datamodules/multi_task_datamodule.py b/python/core/ltp_core/datamodules/multi_task_datamodule.py index 91354a9a..68cd82e4 100644 --- a/python/core/ltp_core/datamodules/multi_task_datamodule.py +++ b/python/core/ltp_core/datamodules/multi_task_datamodule.py @@ -46,9 +46,7 @@ def __init__(self, tokenizer, datamodules, tau=0.8, num_workers=None, pin_memory # data transformations self.tokenizer = AutoTokenizer.from_pretrained(tokenizer) - self.datamodules = { - task: info.load(tokenizer=self.tokenizer) for task, info in datamodules.items() - } + self.datamodules = {task: info.load(tokenizer=self.tokenizer) for task, info in datamodules.items()} self.data_train: Optional[Dict[str, Dataset]] = { name: dataset[datasets.Split.TRAIN] for name, dataset in self.datamodules.items() } @@ -87,14 +85,12 @@ def train_dataloader(self): dataset=dataset, collate_fn=collate, batch_size=self.hparams.datamodules[name].batch_size, - num_workers=self.hparams.num_workers - or self.hparams.datamodules[name].num_workers, - pin_memory=self.hparams.pin_memory - or self.hparams.datamodules[name].pin_memory, + num_workers=self.hparams.num_workers or self.hparams.datamodules[name].num_workers, + pin_memory=self.hparams.pin_memory or self.hparams.datamodules[name].pin_memory, shuffle=True, ) for name, dataset in self.data_train.items() - } + }, ) def val_dataloader(self): diff --git a/python/core/ltp_core/datamodules/utils/collate.py b/python/core/ltp_core/datamodules/utils/collate.py index 96921883..c6fc2ba7 100644 --- a/python/core/ltp_core/datamodules/utils/collate.py +++ b/python/core/ltp_core/datamodules/utils/collate.py @@ -34,11 +34,7 @@ def collate(batch): return torch.stack(batch, 0, out=out) except Exception: return torch.nn.utils.rnn.pad_sequence(batch, batch_first=True) - elif ( - elem_type.__module__ == "numpy" - and elem_type.__name__ != "str_" - and elem_type.__name__ != "string_" - ): + elif elem_type.__module__ == "numpy" and elem_type.__name__ != "str_" and elem_type.__name__ != "string_": elem = batch[0] if elem_type.__name__ == "ndarray": # array of string classes and object diff --git a/python/core/ltp_core/datamodules/utils/datasets.py b/python/core/ltp_core/datamodules/utils/datasets.py index 67f7dd8f..11ac291c 100644 --- a/python/core/ltp_core/datamodules/utils/datasets.py +++ b/python/core/ltp_core/datamodules/utils/datasets.py @@ -7,14 +7,12 @@ def load_dataset( builder_cls: type, config_name: Optional[str] = None, data_dir: Optional[str] = None, - data_files: Optional[ - Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]] - ] = None, + data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None, split: Optional[Union[str, Split]] = None, cache_dir: Optional[str] = None, features: Optional[Features] = None, save_infos: bool = False, - **config_kwargs + **config_kwargs, ) -> Union[DatasetDict, Dataset]: # Instantiate the dataset builder builder_instance: DatasetBuilder = builder_cls( diff --git a/python/core/ltp_core/datamodules/utils/multitask_dataloader.py b/python/core/ltp_core/datamodules/utils/multitask_dataloader.py index 139bab15..fbce434d 100644 --- a/python/core/ltp_core/datamodules/utils/multitask_dataloader.py +++ b/python/core/ltp_core/datamodules/utils/multitask_dataloader.py @@ -14,9 +14,7 @@ def __init__(self, tau=1.0, **dataloaders): self.dataloaders = dataloaders Z = sum(pow(v, tau) for v in 
self.dataloader_sizes.values()) - self.tasknames, self.sampling_weights = zip( - *((k, pow(v, tau) / Z) for k, v in self.dataloader_sizes.items()) - ) + self.tasknames, self.sampling_weights = zip(*((k, pow(v, tau) / Z) for k, v in self.dataloader_sizes.items())) self.dataiters = {k: cycle(v) for k, v in dataloaders.items()} @property diff --git a/python/core/ltp_core/datamodules/utils/vocab_helper.py b/python/core/ltp_core/datamodules/utils/vocab_helper.py index 42e11a13..62e30204 100644 --- a/python/core/ltp_core/datamodules/utils/vocab_helper.py +++ b/python/core/ltp_core/datamodules/utils/vocab_helper.py @@ -4,9 +4,7 @@ def vocab_builder(func): def func_wrapper(config: BuilderConfig, **kwargs): """We handle string, list and dicts in datafiles.""" if not config.data_files: - raise ValueError( - f"At least one data file must be specified, but got data_files={config.data_files}" - ) + raise ValueError(f"At least one data file must be specified, but got data_files={config.data_files}") data_files = config.data_files if isinstance(data_files, (str, list, tuple)): files = data_files diff --git a/python/core/ltp_core/models/components/graph.py b/python/core/ltp_core/models/components/graph.py index 50544cde..96b0d74a 100644 --- a/python/core/ltp_core/models/components/graph.py +++ b/python/core/ltp_core/models/components/graph.py @@ -38,9 +38,7 @@ def __init__( ) self.arc_atten = Biaffine(arc_hidden_size, arc_hidden_size, 1, bias_x=True, bias_y=False) - self.rel_atten = Biaffine( - rel_hidden_size, rel_hidden_size, num_labels, bias_x=True, bias_y=True - ) + self.rel_atten = Biaffine(rel_hidden_size, rel_hidden_size, num_labels, bias_x=True, bias_y=True) def forward(self, hidden_states, attention_mask=None): bs, seqlen = hidden_states.shape[:2] diff --git a/python/core/ltp_core/models/criterion/token.py b/python/core/ltp_core/models/criterion/token.py index 20af1dd4..3f14ebfe 100644 --- a/python/core/ltp_core/models/criterion/token.py +++ b/python/core/ltp_core/models/criterion/token.py @@ -73,14 +73,7 @@ def __init__(self, temperature_scheduler: Callable[[Tensor, Tensor], float]): super().__init__() self.temperature_scheduler = temperature_scheduler - def forward( - self, - result: TokenClassifierResult, - labels: Tensor, - targets: Tensor = None, - *args, - **kwargs - ) -> Tensor: + def forward(self, result: TokenClassifierResult, labels: Tensor, targets: Tensor = None, *args, **kwargs) -> Tensor: loss = super().forward(result, labels, **kwargs) if targets is not None: @@ -108,18 +101,10 @@ def __init__(self, temperature_scheduler: Callable[[Tensor, Tensor], float]): super().__init__() self.temperature_scheduler = temperature_scheduler - def forward( - self, - result: TokenClassifierResult, - labels: Tensor, - targets: Tensor = None, - *args, - **kwargs - ) -> Tensor: + def forward(self, result: TokenClassifierResult, labels: Tensor, targets: Tensor = None, *args, **kwargs) -> Tensor: loss = super().forward(result, labels, **kwargs) if targets is not None: - crf = result.crf logits = result.logits num_tags = logits.shape[-1] diff --git a/python/core/ltp_core/models/lit_model.py b/python/core/ltp_core/models/lit_model.py index 65cb55ba..4bb3a4ef 100644 --- a/python/core/ltp_core/models/lit_model.py +++ b/python/core/ltp_core/models/lit_model.py @@ -57,9 +57,7 @@ def __init__( metrics = instantiate(metrics) # must use module dict - metrics = ModuleDict( - {task: MetricCollection(metric, prefix=f"{task}/") for task, metric in metrics.items()} - ) + metrics = ModuleDict({task: 
MetricCollection(metric, prefix=f"{task}/") for task, metric in metrics.items()}) self.train_metrics = metrics self.val_metrics = deepcopy(metrics) self.test_metrics = deepcopy(metrics) @@ -216,11 +214,7 @@ def num_training_steps(self) -> int: limit_batches = self.trainer.limit_train_batches batches = len(self.trainer.datamodule.train_dataloader()) - batches = ( - min(batches, limit_batches) - if isinstance(limit_batches, int) - else int(limit_batches * batches) - ) + batches = min(batches, limit_batches) if isinstance(limit_batches, int) else int(limit_batches * batches) num_devices = max(1, self.trainer.num_devices) effective_accum = self.trainer.accumulate_grad_batches * num_devices return (batches // effective_accum) * self.trainer.max_epochs diff --git a/python/core/ltp_core/models/metrics/token.py b/python/core/ltp_core/models/metrics/token.py index 20c5384d..7eeaa695 100644 --- a/python/core/ltp_core/models/metrics/token.py +++ b/python/core/ltp_core/models/metrics/token.py @@ -51,8 +51,7 @@ def update(self, result: TokenClassifierResult, labels: Tensor, **kwargs) -> Non labels = labels.cpu().numpy() labels = [ - [self.labels[tag] for tag, mask in zip(tags, masks) if mask] - for tags, masks in zip(labels, attention_mask) + [self.labels[tag] for tag, mask in zip(tags, masks) if mask] for tags, masks in zip(labels, attention_mask) ] if crf is None: @@ -109,8 +108,7 @@ def update(self, result: TokenClassifierResult, labels: Tensor, **kwargs) -> Non labels = labels.cpu().numpy() labels = [ - [self.labels[tag] for tag, mask in zip(tags, masks) if mask] - for tags, masks in zip(labels, attention_mask) + [self.labels[tag] for tag, mask in zip(tags, masks) if mask] for tags, masks in zip(labels, attention_mask) ] if crf is None: diff --git a/python/core/ltp_core/models/nn/crf.py b/python/core/ltp_core/models/nn/crf.py index 1de9da75..453a2c08 100644 --- a/python/core/ltp_core/models/nn/crf.py +++ b/python/core/ltp_core/models/nn/crf.py @@ -117,9 +117,7 @@ def forward( return llh.sum() / mask.type_as(emissions).sum() @torch.jit.export - def decode( - self, emissions: torch.Tensor, mask: Optional[torch.ByteTensor] = None - ) -> List[List[int]]: + def decode(self, emissions: torch.Tensor, mask: Optional[torch.ByteTensor] = None) -> List[List[int]]: """Find the most likely tag sequence using Viterbi algorithm. 
Args: @@ -150,10 +148,7 @@ def _validate( if emissions.dim() != 3: raise ValueError(f"emissions must have dimension of 3, got {emissions.dim()}") if emissions.size(2) != self.num_tags: - raise ValueError( - f"expected last dimension of emissions is {self.num_tags}, " - f"got {emissions.size(2)}" - ) + raise ValueError(f"expected last dimension of emissions is {self.num_tags}, " f"got {emissions.size(2)}") if tags is not None: if emissions.shape[0] != tags.shape[0] or emissions.shape[1] != tags.shape[1]: @@ -173,9 +168,7 @@ def _validate( if not no_empty_seq and not no_empty_seq_bf: raise ValueError("mask of the first timestep must all be on") - def _compute_score( - self, emissions: torch.Tensor, tags: torch.LongTensor, mask: torch.ByteTensor - ) -> torch.Tensor: + def _compute_score(self, emissions: torch.Tensor, tags: torch.LongTensor, mask: torch.ByteTensor) -> torch.Tensor: # emissions: (seq_length, batch_size, num_tags) # tags: (seq_length, batch_size) # mask: (seq_length, batch_size) @@ -262,9 +255,7 @@ def _compute_normalizer(self, emissions: torch.Tensor, mask: torch.ByteTensor) - # shape: (batch_size,) return torch.logsumexp(score, dim=1) - def _viterbi_decode( - self, emissions: torch.FloatTensor, mask: torch.ByteTensor - ) -> List[List[int]]: + def _viterbi_decode(self, emissions: torch.FloatTensor, mask: torch.ByteTensor) -> List[List[int]]: # emissions: (seq_length, batch_size, num_tags) # mask: (seq_length, batch_size) assert emissions.dim() == 3 and mask.dim() == 2 diff --git a/python/core/ltp_core/models/nn/relative_transformer.py b/python/core/ltp_core/models/nn/relative_transformer.py index e3db54b6..9e552875 100644 --- a/python/core/ltp_core/models/nn/relative_transformer.py +++ b/python/core/ltp_core/models/nn/relative_transformer.py @@ -38,9 +38,7 @@ def get_embedding(self, num_embeddings, embedding_dim): half_dim = embedding_dim // 2 emb = math.log(10000) / (half_dim - 1) emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb) - emb = torch.arange(-num_embeddings // 2, num_embeddings // 2, dtype=torch.float).unsqueeze( - 1 - ) * emb.unsqueeze(0) + emb = torch.arange(-num_embeddings // 2, num_embeddings // 2, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0) emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1) if embedding_dim % 2 == 1: # zero pad @@ -51,9 +49,7 @@ def get_embedding(self, num_embeddings, embedding_dim): def forward(self, inputs: Tensor): """Input is expected to be of size [bsz x seqlen].""" bsz, seq_len = inputs.size() - positions = ( - torch.arange(-seq_len, seq_len).to(inputs.device).long() + self.origin_shift - ) # 2*seq_len + positions = torch.arange(-seq_len, seq_len).to(inputs.device).long() + self.origin_shift # 2*seq_len embed = self.weights.index_select(0, positions.long()).detach() return embed @@ -82,12 +78,8 @@ def __init__( self.dropout_layer = nn.Dropout(dropout) self.pos_embed = RelativeEmbedding(input_size // num_head, max_length) if r_r_bias is None or r_w_bias is None: # Biases are not shared - self.r_r_bias = nn.Parameter( - nn.init.xavier_normal_(torch.zeros(num_head, input_size // num_head)) - ) - self.r_w_bias = nn.Parameter( - nn.init.xavier_normal_(torch.zeros(num_head, input_size // num_head)) - ) + self.r_r_bias = nn.Parameter(nn.init.xavier_normal_(torch.zeros(num_head, input_size // num_head))) + self.r_w_bias = nn.Parameter(nn.init.xavier_normal_(torch.zeros(num_head, input_size // num_head))) else: self.r_r_bias = r_r_bias # r_r_bias就是v self.r_w_bias = r_w_bias # r_w_bias就是u @@ 
-111,12 +103,8 @@ def forward(self, x, mask): rw_head_q = q + self.r_r_bias[:, None] AC = torch.einsum("bnqd,bnkd->bnqk", rw_head_q, k) # b x n x l x d, n是head - D_ = torch.einsum("nd,ld->nl", self.r_w_bias, pos_embed)[ - None, :, None - ] # head x 2max_len, 每个head对位置的bias - B_ = torch.einsum( - "bnqd,ld->bnql", q, pos_embed - ) # bsz x head x max_len x 2max_len,每个query对每个shift的偏移 + D_ = torch.einsum("nd,ld->nl", self.r_w_bias, pos_embed)[None, :, None] # head x 2max_len, 每个head对位置的bias + B_ = torch.einsum("bnqd,ld->bnql", q, pos_embed) # bsz x head x max_len x 2max_len,每个query对每个shift的偏移 BD = B_ + D_ # bsz x head x max_len x 2max_len, 要转换为bsz x head x max_len x max_len BD = self._shift(BD) attn = AC + BD @@ -125,9 +113,7 @@ def forward(self, x, mask): attn = F.softmax(attn, dim=-1) attn = self.dropout_layer(attn) - v = ( - torch.matmul(attn, v).transpose(1, 2).reshape(batch_size, max_len, d_model) - ) # b x n x l x d + v = torch.matmul(attn, v).transpose(1, 2).reshape(batch_size, max_len, d_model) # b x n x l x d return v @@ -146,9 +132,7 @@ def _shift(self, BD): """ bsz, n_head, max_len, _ = BD.size() zero_pad = BD.new_zeros(bsz, n_head, max_len, 1) - BD = torch.cat([BD, zero_pad], dim=-1).view( - bsz, n_head, -1, max_len - ) # bsz x n_head x (2max_len+1) x max_len + BD = torch.cat([BD, zero_pad], dim=-1).view(bsz, n_head, -1, max_len) # bsz x n_head x (2max_len+1) x max_len BD = BD[:, :, :-1, :].view(bsz, n_head, max_len, -1) # bsz x n_head x 2max_len x max_len BD = BD[:, :, :, :max_len] return BD @@ -176,9 +160,7 @@ def __init__( self.after_norm = after_norm self.norm1 = nn.LayerNorm(input_size) self.norm2 = nn.LayerNorm(input_size) - self.self_attn = RelativeMultiHeadAttn( - input_size, num_heads, dropout=dropout, max_length=max_length - ) + self.self_attn = RelativeMultiHeadAttn(input_size, num_heads, dropout=dropout, max_length=max_length) self.ffn = MLP( [input_size, hidden_size, input_size], activation=nn.LeakyReLU, diff --git a/python/core/ltp_core/models/optimization/layer_lrs.py b/python/core/ltp_core/models/optimization/layer_lrs.py index 69e50c8d..06f6b9e9 100644 --- a/python/core/ltp_core/models/optimization/layer_lrs.py +++ b/python/core/ltp_core/models/optimization/layer_lrs.py @@ -30,9 +30,7 @@ def get_layer_lrs_with_crf( else: raise Exception("Not Recommend!!!") - if is_transformer and any( - x in name for x in ["bias", "LayerNorm.bias", "LayerNorm.weight"] - ): + if is_transformer and any(x in name for x in ["bias", "LayerNorm.bias", "LayerNorm.weight"]): if temp_no_decay_groups[depth] is None: temp_no_decay_groups[depth] = [] temp_no_decay_groups[depth].append(parameters) diff --git a/python/core/ltp_core/models/utils/instantiate.py b/python/core/ltp_core/models/utils/instantiate.py index d673105f..95954de1 100644 --- a/python/core/ltp_core/models/utils/instantiate.py +++ b/python/core/ltp_core/models/utils/instantiate.py @@ -30,11 +30,7 @@ def instantiate(config, target="_ltp_target_", partial="_ltp_partial_"): target_callable = find_callable(target_path) is_partial = config.get(partial, False) - target_args = { - key: instantiate(value) - for key, value in config.items() - if key not in [target, partial] - } + target_args = {key: instantiate(value) for key, value in config.items() if key not in [target, partial]} if is_partial: return functools.partial(target_callable, **target_args) @@ -54,11 +50,7 @@ def instantiate_omega(config, target="_ltp_target_", partial="_ltp_partial_"): target_callable = find_callable(target_path) is_partial = config.get(partial, False) - 
target_args = { - key: instantiate_omega(value) - for key, value in config.items() - if key not in [target, partial] - } + target_args = {key: instantiate_omega(value) for key, value in config.items() if key not in [target, partial]} if is_partial: return functools.partial(target_callable, **target_args) diff --git a/python/core/ltp_core/utils/utils.py b/python/core/ltp_core/utils/utils.py index 2b94deec..40a38f3a 100644 --- a/python/core/ltp_core/utils/utils.py +++ b/python/core/ltp_core/utils/utils.py @@ -29,7 +29,6 @@ def task_wrapper(task_func: Callable) -> Callable: """ def wrap(cfg: DictConfig): - # apply extra utilities extras(cfg) @@ -150,12 +149,8 @@ def log_hyperparameters(object_dict: dict) -> None: # save number of model parameters hparams["model/params/total"] = sum(p.numel() for p in model.parameters()) - hparams["model/params/trainable"] = sum( - p.numel() for p in model.parameters() if p.requires_grad - ) - hparams["model/params/non_trainable"] = sum( - p.numel() for p in model.parameters() if not p.requires_grad - ) + hparams["model/params/trainable"] = sum(p.numel() for p in model.parameters() if p.requires_grad) + hparams["model/params/non_trainable"] = sum(p.numel() for p in model.parameters() if not p.requires_grad) hparams["datamodule"] = cfg["datamodule"] hparams["trainer"] = cfg["trainer"] diff --git a/python/core/tests/test_crf.py b/python/core/tests/test_crf.py index af8de763..7fe349a8 100644 --- a/python/core/tests/test_crf.py +++ b/python/core/tests/test_crf.py @@ -110,8 +110,7 @@ def test_works_with_mask(self): emission, tag = emission[:seq_len], tag[:seq_len] numerator = compute_score(crf, emission, tag) all_scores = [ - compute_score(crf, emission, t) - for t in itertools.product(range(crf.num_tags), repeat=seq_len) + compute_score(crf, emission, t) for t in itertools.product(range(crf.num_tags), repeat=seq_len) ] denominator = math.log(sum(math.exp(s) for s in all_scores)) manual_llh += numerator - denominator @@ -180,8 +179,7 @@ def test_reduction_none(self): for emission, tag in zip(emissions, tags): numerator = compute_score(crf, emission, tag) all_scores = [ - compute_score(crf, emission, t) - for t in itertools.product(range(crf.num_tags), repeat=seq_length) + compute_score(crf, emission, t) for t in itertools.product(range(crf.num_tags), repeat=seq_length) ] denominator = math.log(sum(math.exp(s) for s in all_scores)) manual_llh.append(numerator - denominator) @@ -213,8 +211,7 @@ def test_reduction_mean(self): for emission, tag in zip(emissions, tags): numerator = compute_score(crf, emission, tag) all_scores = [ - compute_score(crf, emission, t) - for t in itertools.product(range(crf.num_tags), repeat=seq_length) + compute_score(crf, emission, t) for t in itertools.product(range(crf.num_tags), repeat=seq_length) ] denominator = math.log(sum(math.exp(s) for s in all_scores)) manual_llh += numerator - denominator @@ -251,8 +248,7 @@ def test_reduction_token_mean(self): emission, tag = emission[:seq_len], tag[:seq_len] numerator = compute_score(crf, emission, tag) all_scores = [ - compute_score(crf, emission, t) - for t in itertools.product(range(crf.num_tags), repeat=seq_len) + compute_score(crf, emission, t) for t in itertools.product(range(crf.num_tags), repeat=seq_len) ] denominator = math.log(sum(math.exp(s) for s in all_scores)) manual_llh += numerator - denominator @@ -297,8 +293,7 @@ def test_scripted_forward(self): llh = crf(emissions, tags, mask=mask) llh_scripted = crf_script(emissions, tags, mask=mask) assert torch.equal(llh, llh_scripted), ( 
- f"scripted crf forward output {llh_scripted} " - f"not matching non-scripted forward output {llh}" + f"scripted crf forward output {llh_scripted} " f"not matching non-scripted forward output {llh}" ) # Test scripted forward works without mask @@ -312,8 +307,7 @@ def test_scripted_forward(self): llh_mask = crf(emissions, tags, mask=torch.ones_like(tags).byte()) llh_mask_script = crf_script(emissions, tags, mask=torch.ones_like(tags).byte()) assert torch.equal(llh_mask, llh_mask_script), ( - f"scripted crf forward output {llh_mask_script} " - f"not matching non-scripted forward output {llh_mask}" + f"scripted crf forward output {llh_mask_script} " f"not matching non-scripted forward output {llh_mask}" ) # Test scripted forward in batched setting @@ -325,8 +319,7 @@ def test_scripted_forward(self): llh = crf(emissions_batch, tags_batch) llh_script = crf_script(emissions_batch, tags_batch) assert torch.equal(llh_script, llh), ( - f"scripted crf forward output {llh_script} " - f"not matching non-scripted forward output {llh}" + f"scripted crf forward output {llh_script} " f"not matching non-scripted forward output {llh}" ) # Test scripted forward when reduction is None, mean, token_mean @@ -337,22 +330,19 @@ def test_scripted_forward(self): llh = crf(emissions, tags, reduction="none") llh_script = crf_script(emissions, tags, reduction="none") assert torch.equal(llh_script, llh), ( - f"scripted crf forward output {llh_script} " - f"not matching non-scripted forward output {llh}" + f"scripted crf forward output {llh_script} " f"not matching non-scripted forward output {llh}" ) llh = crf(emissions, tags, reduction="mean") llh_script = crf_script(emissions, tags, reduction="mean") assert torch.equal(llh_script, llh), ( - f"scripted crf forward output {llh_script} " - f"not matching non-scripted forward output {llh}" + f"scripted crf forward output {llh_script} " f"not matching non-scripted forward output {llh}" ) mask = torch.tensor([[1, 1, 1], [1, 1, 0]], dtype=torch.uint8).transpose(0, 1) llh = crf(emissions, tags, mask=mask, reduction="token_mean") llh_script = crf_script(emissions, tags, mask=mask, reduction="token_mean") assert torch.equal(llh_script, llh), ( - f"scripted crf forward output {llh_script} " - f"not matching non-scripted forward output {llh}" + f"scripted crf forward output {llh_script} " f"not matching non-scripted forward output {llh}" ) # Test scripted forward when running batch first mode @@ -369,8 +359,7 @@ def test_scripted_forward(self): llh_bf = crf_bf(emissions, tags) llh_bf_script = crf_bf_script(emissions, tags) assert torch.equal(llh_bf_script, llh_bf), ( - f"scripted crf forward output {llh_bf_script} " - f"not matching non-scripted forward output {llh_bf}" + f"scripted crf forward output {llh_bf_script} " f"not matching non-scripted forward output {llh_bf}" ) def test_emissions_has_bad_number_of_dimension(self): @@ -389,9 +378,9 @@ def test_emissions_and_tags_size_mismatch(self): with pytest.raises(ValueError) as excinfo: crf(emissions, tags) - assert ( - "the first two dimensions of emissions and tags must match, " "got (1, 2) and (2, 2)" - ) in str(excinfo.value) + assert ("the first two dimensions of emissions and tags must match, " "got (1, 2) and (2, 2)") in str( + excinfo.value + ) def test_emissions_last_dimension_not_equal_to_number_of_tags(self): emissions = torch.randn(1, 2, 3) @@ -527,8 +516,7 @@ def test_scripted_decode(self): best_tags = crf.decode(emissions, mask=mask) best_tags_scripted = crf_script.decode(emissions, mask=mask) assert best_tags == 
best_tags_scripted, ( - f"scripted decode output {best_tags_scripted} " - f"doesn't match non-scripted output {best_tags}" + f"scripted decode output {best_tags_scripted} " f"doesn't match non-scripted output {best_tags}" ) # Test decoding without a mask @@ -548,8 +536,7 @@ def test_scripted_decode(self): batched = crf.decode(emissions_batched, mask=mask_batched) batched_scripted = crf_script.decode(emissions_batched, mask=mask_batched) assert batched == batched_scripted, ( - f"scripted decode output {batched_scripted} " - f"doesn't match non-scripted output {batched}" + f"scripted decode output {batched_scripted} " f"doesn't match non-scripted output {batched}" ) # Test batch first decode @@ -565,8 +552,7 @@ def test_scripted_decode(self): best_tags_bf = crf_bf.decode(emissions) best_tags_bf_script = crf_bf_script.decode(emissions) assert best_tags_bf == best_tags_bf_script, ( - f"scripted decode output {best_tags_bf_script} " - f"doesn't match non-scripted decode output {best_tags_bf}" + f"scripted decode output {best_tags_bf_script} " f"doesn't match non-scripted decode output {best_tags_bf}" ) def test_emissions_has_bad_number_of_dimension(self): @@ -592,9 +578,9 @@ def test_emissions_and_mask_size_mismatch(self): with pytest.raises(ValueError) as excinfo: crf.decode(emissions, mask=mask) - assert ( - "the first two dimensions of emissions and mask must match, " "got (1, 2) and (2, 2)" - ) in str(excinfo.value) + assert ("the first two dimensions of emissions and mask must match, " "got (1, 2) and (2, 2)") in str( + excinfo.value + ) def test_first_timestep_mask_is_not_all_on(self): emissions = torch.randn(3, 2, 4) diff --git a/python/extension/Cargo.toml b/python/extension/Cargo.toml index aefea553..29630a07 100644 --- a/python/extension/Cargo.toml +++ b/python/extension/Cargo.toml @@ -21,7 +21,7 @@ rayon = { version = "1.7" } rayon-cond = { version = "0.3" } anyhow = { version = "1.0" } serde = { version = "1.0", features = ["derive"] } -pyo3 = { version = "0.20", features = ["extension-module", "anyhow", "serde"] } +pyo3 = { version = "0.21", features = ["extension-module", "anyhow", "serde"] } mimalloc = { version = "0.1", default-features = false, optional = true } [dependencies.ltp] diff --git a/python/extension/utils/stub.py b/python/extension/utils/stub.py index c7ec2fa4..82a421cc 100644 --- a/python/extension/utils/stub.py +++ b/python/extension/utils/stub.py @@ -46,9 +46,7 @@ def fn_predicate(obj): def get_module_members(module, with_module=False): if with_module: - members = [ - member for name, member in inspect.getmembers(module) if not name.startswith("_") - ] + members = [member for name, member in inspect.getmembers(module) if not name.startswith("_")] else: members = [ member @@ -88,7 +86,7 @@ def pyi_file(obj, indent=""): body += f"{indent + INDENT}pass\n" body += "\n" - for (name, fn) in fns: + for name, fn in fns: body += pyi_file(fn, indent=indent) if not body: @@ -141,9 +139,7 @@ def do_black(content, is_pyi): def write(module, directory, origin, check=False): - submodules = [ - (name, member) for name, member in inspect.getmembers(module) if inspect.ismodule(member) - ] + submodules = [(name, member) for name, member in inspect.getmembers(module) if inspect.ismodule(member)] filename = os.path.join(directory, f"{origin}.pyi") pyi_content = pyi_file(module) @@ -152,9 +148,7 @@ def write(module, directory, origin, check=False): if check: with open(filename) as f: data = f.read() - assert ( - data == pyi_content - ), f"The content of {filename} seems outdated, 
please run `python stub.py`"
+        assert data == pyi_content, f"The content of {filename} seems outdated, please run `python stub.py`"
     else:
         with open(filename, "w") as f:
             f.write(pyi_content)
@@ -177,9 +171,7 @@ def write(module, directory, origin, check=False):
         if check:
             with open(filename) as f:
                 data = f.read()
-            assert (
-                data == py_content
-            ), f"The content of {filename} seems outdated, please run `python stub.py`"
+            assert data == py_content, f"The content of {filename} seems outdated, please run `python stub.py`"
         else:
             with open(filename, "w") as f:
                 f.write(py_content)
diff --git a/python/interface/examples/issues.py b/python/interface/examples/issues.py
index 921f44c4..85490be9 100644
--- a/python/interface/examples/issues.py
+++ b/python/interface/examples/issues.py
@@ -91,8 +91,17 @@ def issue686():
         print(e)
 
 
+def issue693():
+    from ltp import LTP
+    ltp = LTP("LTP/tiny")
+    print(ltp.pipeline(
+        ["视觉Transformers通过将图像区域表示为转换后的tokens并通过注意力权重整合它们来提取视觉信息。"],
+        tasks=["cws"])
+    )
+
+
 def main():
-    issue686()
+    issue693()
 
 
 if __name__ == "__main__":
diff --git a/python/interface/examples/server.py b/python/interface/examples/server.py
new file mode 100644
index 00000000..43ea160e
--- /dev/null
+++ b/python/interface/examples/server.py
@@ -0,0 +1,161 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+# Author: Yunlong Feng
+
+"""
+LTP Server is a simple wrapper around LTP. It depends on tornado and is used as follows:
+.. code-block:: bash
+    pip install ltp tornado fire
+    python examples/server.py serve
+"""
+
+import sys
+import json
+import logging
+from typing import List
+
+import torch
+
+from tornado import ioloop
+from tornado.httpserver import HTTPServer
+from tornado.web import Application, RequestHandler
+from tornado.log import app_log, gen_log, access_log, LogFormatter
+from fire import Fire
+
+from ltp import LTP
+
+
+class LTPHandler(RequestHandler):
+    def set_default_headers(self):
+        self.set_header("Access-Control-Allow-Origin", "*")
+        self.set_header('Access-Control-Allow-Headers', 'Content-Type')
+        self.set_header('Access-Control-Allow-Methods', 'GET, POST, PUT, DELETE, PATCH, OPTIONS')
+        self.set_header('Content-Type', 'application/json;charset=UTF-8')
+
+    def initialize(self, ltp):
+        self.set_default_headers()
+        self.ltp = ltp
+
+    def post(self):
+        try:
+            print(self.request.body.decode('utf-8'))
+            text = json.loads(self.request.body.decode('utf-8'))['text']
+            # print(text)
+            result = self.ltp._predict([text])
+            # print(result)
+            self.finish(result)
+        except Exception as e:
+            self.finish(self.ltp._predict(['服务器遇到错误!'])[0])
+
+    def options(self):
+        pass
+
+
+class Server(object):
+    def __init__(self, path: str = 'LTP/tiny', batch_size: int = 50, device: str = None):
+        # 2024/6/1 7:9:45 adapt for "ltp==4.2.13"
+        self.ltp = LTP(path)
+        self.batch_size = batch_size
+        # Move the model to the GPU if one is available
+        if device is None and torch.cuda.is_available():
+            # ltp.cuda()
+            self.ltp.to("cuda")
+        elif device is not None:
+            self.ltp.to(device)
+
+    def _predict(self, sentences: List[str]):
+        output = self.ltp.pipeline(sentences, tasks=["cws", "pos", "ner", "srl", "dep", "sdp", "sdpg"])
+
+        # https://github.com/HIT-SCIR/ltp/blob/main/python/interface/docs/quickstart.rst
+        # Note: in the dependency parse, the virtual ROOT node occupies position 0, so word indices start from 1.
+        id = 0
+        offset = 0
+        words = []
+        for word, pos, parent, relation in \
+                zip(output.cws[0], output.pos[0], output.dep[0]['head'], output.dep[0]['label']):
+            # print([id, word, pos, parent, relation])
+            words.append({
+                'id': id,
+                'length': len(word),
+                'offset': offset,
+                'text': word,
+                'pos': pos,
+                'parent': parent - 1,
+                'relation':
relation, + 'roles': [], + 'parents': [] + }) + id = id + 1 + offset = offset + len(word) + + for token_srl in output.srl[0]: + for argument in token_srl['arguments']: + # print(token_srl['index'], token_srl['predicate'], argument) + text = argument[1] + start = argument[2] + offset = words[start]['offset'] + words[token_srl['index']]['roles'].append({ + 'text': text, + 'offset': offset, + 'length': len(text), + 'type': argument[0] + }) + + start = 0 + for end, label in \ + zip(output.sdp[0]['head'], output.sdp[0]['label']): + words[start]['parents'].append({'parent': end - 1, 'relate': label}) + start = start + 1 + + nes = [] + for role, text, start, end in output.ner[0]: + nes.append({ + 'text': text, + 'offset': start, + 'ne': role.lower(), + 'length': len(text) + }) + + result = { + 'text': sentences[0], + 'nes': nes, + 'words': words + } + + return result + + def serve(self, port: int = 5000, n_process: int = None): + if n_process is None: + n_process = 1 if sys.platform == 'win32' else 8 + + fmt = LogFormatter(fmt='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', color=True) + root_logger = logging.getLogger() + + console_handler = logging.StreamHandler() + file_handler = logging.FileHandler('server.log') + + console_handler.setFormatter(fmt) + file_handler.setFormatter(fmt) + + root_logger.addHandler(console_handler) + root_logger.addHandler(file_handler) + + app_log.setLevel(logging.INFO) + gen_log.setLevel(logging.INFO) + access_log.setLevel(logging.INFO) + + # app_log.info("Model is loading...") + app_log.info("Model Has Been Loaded!") + + app = Application([ + (r"/.*", LTPHandler, dict(ltp=self)) + ]) + + server = HTTPServer(app) + server.bind(port) + server.start(n_process) + ioloop.IOLoop.instance().start() + + +if __name__ == '__main__': + Fire(Server) diff --git a/python/interface/ltp/generic.py b/python/interface/ltp/generic.py index 74d75c9a..defb6b2a 100644 --- a/python/interface/ltp/generic.py +++ b/python/interface/ltp/generic.py @@ -26,9 +26,7 @@ def __post_init__(self): if not len(class_fields): raise ValueError(f"{self.__class__.__name__} has no fields.") if not all(field.default is None for field in class_fields[1:]): - raise ValueError( - f"{self.__class__.__name__} should not have more than one required field." - ) + raise ValueError(f"{self.__class__.__name__} should not have more than one required field.") for field in class_fields: v = getattr(self, field.name) diff --git a/python/interface/ltp/interface.py b/python/interface/ltp/interface.py index 445a24f3..6faf9125 100644 --- a/python/interface/ltp/interface.py +++ b/python/interface/ltp/interface.py @@ -13,7 +13,6 @@ def LTP( pretrained_model_name_or_path="LTP/small", force_download: bool = False, - resume_download: bool = False, proxies: Dict = None, use_auth_token: Optional[str] = None, cache_dir: Optional[str] = None, @@ -55,9 +54,6 @@ def LTP( Whether to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether to delete incompletely received files. Will - attempt to resume the download if such a file exists. 
proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', @@ -106,7 +102,6 @@ def LTP( cache_dir=cache_dir, force_download=force_download, proxies=proxies, - resume_download=resume_download, use_auth_token=use_auth_token, local_files_only=local_files_only, ) @@ -128,7 +123,6 @@ def LTP( cache_dir, force_download, proxies, - resume_download, local_files_only, use_auth_token, **model_kwargs, @@ -140,7 +134,6 @@ def LTP( cache_dir, force_download, proxies, - resume_download, local_files_only, use_auth_token, **model_kwargs, diff --git a/python/interface/ltp/legacy.py b/python/interface/ltp/legacy.py index b1d4cbda..06aba898 100644 --- a/python/interface/ltp/legacy.py +++ b/python/interface/ltp/legacy.py @@ -101,18 +101,12 @@ def pipeline( for idx, sent in enumerate(ner): words = sentences[idx] new_store.append( - [ - (tag, "".join(words[start : end + 1])) - for tag, start, end in get_entities(sent) - ] + [(tag, "".join(words[start : end + 1])) for tag, start, end in get_entities(sent)] ) ner = new_store else: words = args[0] - ner = [ - (tag, "".join(words[start : end + 1])) - for tag, start, end in get_entities(ner) - ] + ner = [(tag, "".join(words[start : end + 1])) for tag, start, end in get_entities(ner)] args = (*args, ner) else: raise ValueError(f"Invalid task: {task}") @@ -138,7 +132,6 @@ def _from_pretrained( cache_dir, force_download, proxies, - resume_download, local_files_only, use_auth_token, **model_kwargs, @@ -148,8 +141,7 @@ def _from_pretrained( if os.path.isdir(model_id): print("Loading weights from local directory") model_files = { - task: os.path.join(model_id, model_file) - for task, model_file in model_kwargs["config"]["tasks"].items() + task: os.path.join(model_id, model_file) for task, model_file in model_kwargs["config"]["tasks"].items() } else: model_files = { @@ -160,7 +152,6 @@ def _from_pretrained( cache_dir=cache_dir, force_download=force_download, proxies=proxies, - resume_download=resume_download, use_auth_token=use_auth_token, local_files_only=local_files_only, ) diff --git a/python/interface/ltp/module.py b/python/interface/ltp/module.py index b2b03747..a81d8732 100644 --- a/python/interface/ltp/module.py +++ b/python/interface/ltp/module.py @@ -58,9 +58,7 @@ def half(self) -> Module: self.__update_properties(dtype=torch.half) return super().half() - def __update_properties( - self, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None - ): + def __update_properties(self, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None): def apply_fn(module): if not isinstance(module, BaseModule): return diff --git a/python/interface/ltp/nerual.py b/python/interface/ltp/nerual.py index ef0d6491..959b81c0 100644 --- a/python/interface/ltp/nerual.py +++ b/python/interface/ltp/nerual.py @@ -196,7 +196,7 @@ def pipeline( words = sentences[idx] new_store.append( [ - (tag, "".join(words[start : end + 1])) + (tag, "".join(words[start : end + 1]), start, end) for tag, start, end in get_entities(sent) ] ) @@ -209,13 +209,11 @@ def pipeline( for item, predicate in enumerate(words): arguments = [ - (tag, "".join(words[start : end + 1])) + (tag, "".join(words[start : end + 1]), start, end) for tag, start, end in get_entities(sent[item]) ] if arguments: - new_store[-1].append( - {"predicate": predicate, "arguments": arguments} - ) + new_store[-1].append({"index": item, "predicate": predicate, "arguments": arguments}) store[task] = new_store if is_batch: @@ -302,10 
+300,7 @@ def _cws_post( batch_first=True, ) words_attention_mask = torch.nn.utils.rnn.pad_sequence( - [ - torch.as_tensor([True for e in sent_entities], device=self.device) - for sent_entities in entities - ], + [torch.as_tensor([True for e in sent_entities], device=self.device) for sent_entities in entities], batch_first=True, ) hidden["word_index"] = words_idx @@ -435,10 +430,7 @@ def _dep_post( length = torch.sum(attention_mask, dim=1).view(-1).cpu().numpy() + 1 arcs = [sequence for sequence in eisner(s_arc.tolist(), length.tolist(), True)] rels = torch.argmax(s_rel[:, 1:], dim=-1).cpu().numpy() - rels = [ - [self.dep_vocab[rels[s, t, a]] for t, a in enumerate(arc)] - for s, arc in enumerate(arcs) - ] + rels = [[self.dep_vocab[rels[s, t, a]] for t, a in enumerate(arc)] for s, arc in enumerate(arcs)] return [{"head": arc, "label": rel} for arc, rel in zip(arcs, rels)] @@ -473,10 +465,7 @@ def _sdp_post( if tree: rels = torch.argmax(s_rel[:, 1:], dim=-1).cpu().numpy() - rels = [ - [self.sdp_vocab[rels[s, t, a]] for t, a in enumerate(arc)] - for s, arc in enumerate(e_arcs) - ] + rels = [[self.sdp_vocab[rels[s, t, a]] for t, a in enumerate(arc)] for s, arc in enumerate(e_arcs)] return [{"head": arc, "label": rel} for arc, rel in zip(e_arcs, rels)] for b, arc in enumerate(e_arcs): @@ -522,7 +511,6 @@ def _from_pretrained( cache_dir, force_download, proxies, - resume_download, local_files_only, use_auth_token, map_location="cpu", @@ -536,9 +524,7 @@ def _from_pretrained( if os.path.isdir(model_id): print("Loading weights from local directory") model_file = os.path.join(model_id, PYTORCH_WEIGHTS_NAME) - tokenizer = AutoTokenizer.from_pretrained( - model_id, config=ltp.model.backbone.config, use_fast=True - ) + tokenizer = AutoTokenizer.from_pretrained(model_id, config=ltp.model.backbone.config, use_fast=True) else: model_file = cls.download( repo_id=model_id, @@ -547,7 +533,6 @@ def _from_pretrained( cache_dir=cache_dir, force_download=force_download, proxies=proxies, - resume_download=resume_download, use_auth_token=use_auth_token, local_files_only=local_files_only, ) @@ -558,7 +543,6 @@ def _from_pretrained( cache_dir=cache_dir, force_download=force_download, proxies=proxies, - resume_download=resume_download, use_auth_token=use_auth_token, local_files_only=local_files_only, use_fast=True, diff --git a/rust/ltp/Cargo.toml b/rust/ltp/Cargo.toml index 7edba929..17c3d56a 100644 --- a/rust/ltp/Cargo.toml +++ b/rust/ltp/Cargo.toml @@ -34,7 +34,7 @@ required-features = ["serialization", "parallel"] [dependencies] anyhow = "1" num-traits = "0.2" -itertools = "0.12" +itertools = "0.13" cedarwood = "0.4"
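
Note on the user-visible change: after this patch, `pipeline` results for ner and
srl carry word-level offsets. Each ner entry becomes a `(tag, text, start, end)`
tuple, and each srl frame is a dict that now records the predicate's word index
under `"index"`, with its arguments as `(role, text, start, end)` tuples. A minimal
consumption sketch (the LTP/tiny checkpoint and the sample sentence are assumptions,
not part of the patch; `start`/`end` are inclusive word indices, matching the
`words[start : end + 1]` joins in the diff):

    from ltp import LTP

    ltp = LTP("LTP/tiny")
    out = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks=["cws", "pos", "ner", "srl"])

    words = out.cws[0]
    for tag, text, start, end in out.ner[0]:
        # inclusive word indices: text == "".join(words[start : end + 1])
        print(tag, text, start, end)

    for frame in out.srl[0]:
        # "index" is the predicate's position in `words`, added by this patch
        print(frame["index"], frame["predicate"])
        for role, text, start, end in frame["arguments"]:
            print("  ", role, text, start, end)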
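
The new examples/server.py exposes `Server._predict` behind a tornado handler that
accepts a POST body of the form `{"text": "..."}` on any path and answers with a
JSON object holding `text`, `words`, and `nes`. A hypothetical client sketch
(assumes the server was started with `python examples/server.py serve` and is
listening on the default port 5000):

    import json
    from urllib.request import Request, urlopen

    req = Request(
        "http://localhost:5000/",
        data=json.dumps({"text": "他叫汤姆去拿外衣。"}).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urlopen(req) as resp:
        result = json.loads(resp.read().decode("utf-8"))

    for word in result["words"]:
        # `parent` is 0-based here: the server subtracts 1, so ROOT becomes -1
        print(word["id"], word["text"], word["pos"], word["parent"], word["relation"])
    for ne in result["nes"]:
        print(ne["ne"], ne["text"], ne["offset"], ne["length"])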
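
The crf.py hunks above only reflow signatures and messages; behavior is unchanged.
For orientation, a small round-trip consistent with the signatures shown in the diff
and in test_crf.py (the constructor call follows the pytorch-crf convention this
module mirrors, and the shapes are an assumption for illustration):

    import torch
    from ltp_core.models.nn.crf import CRF  # module path as in this patch

    crf = CRF(num_tags=5, batch_first=True)
    emissions = torch.randn(2, 3, 5)  # (batch_size, seq_length, num_tags)
    tags = torch.randint(0, 5, (2, 3))
    mask = torch.tensor([[1, 1, 1], [1, 1, 0]], dtype=torch.uint8)

    llh = crf(emissions, tags, mask=mask, reduction="mean")  # log-likelihood
    best = crf.decode(emissions, mask=mask)  # List[List[int]] via Viterbi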