Commit
Add start, end to ner, srl
AlongWY committed Jun 8, 2024
1 parent 17f689c commit 491a5c1
Showing 32 changed files with 280 additions and 271 deletions.
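Note on the commit title: it refers to span offsets added to the ner and srl outputs. Purely as an illustration (the field names and values below are assumptions, not taken from this diff), an entity annotation carrying such offsets might look like:

    # Hypothetical shape of an NER annotation with span offsets added;
    # LTP's actual output structure is not shown on this commit page.
    entity = {"type": "Nh", "text": "汤姆", "start": 0, "end": 2}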
1 change: 0 additions & 1 deletion python/core/ltp_core/algorithms/__init__.py
@@ -3,7 +3,6 @@
from ltp_extension.algorithms import eisner as rust_eisner

def eisner(scores, mask, remove_root=False):

scores = scores.view(-1).cpu().numpy()
length = torch.sum(mask, dim=1).cpu().numpy()

4 changes: 1 addition & 3 deletions python/core/ltp_core/datamodules/adapters/postagger.py
@@ -48,9 +48,7 @@ def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs):

os.environ["TOKENIZERS_PARALLELISM"] = "true"
dataset = load_dataset(Conllu, data_dir=data_dir, cache_dir=data_dir)
dataset = dataset.remove_columns(
["id", "lemma", "upos", "feats", "head", "deprel", "deps", "misc"]
)
dataset = dataset.remove_columns(["id", "lemma", "upos", "feats", "head", "deprel", "deps", "misc"])
dataset = dataset.rename_column("xpos", "labels")
dataset = dataset.map(lambda examples: tokenize(examples, tokenizer, max_length), batched=True)
dataset = dataset.filter(lambda x: not x["overflow"])
4 changes: 1 addition & 3 deletions python/core/ltp_core/datamodules/adapters/segmention.py
@@ -67,9 +67,7 @@ def build_dataset(data_dir, task_name, tokenizer, max_length=512, mode="bmes", *

os.environ["TOKENIZERS_PARALLELISM"] = "true"
dataset = load_dataset(Conllu, data_dir=data_dir, cache_dir=data_dir)
dataset = dataset.remove_columns(
["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc"]
)
dataset = dataset.remove_columns(["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc"])
if mode == "bmes":
dataset = dataset.map(
lambda examples: tokenize(examples, tokenizer, max_length, length2bmes),
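The hunk above selects a BMES tagging mode for segmentation. As a rough sketch of BMES encoding (the length2bmes helper itself is not shown in this diff, so the function below is an assumption about its behaviour, not LTP's implementation):

    # Convert per-word lengths to BMES word-segmentation tags:
    # a 1-character word is "S"; longer words get "B", "M" * (n - 2), "E".
    def lengths_to_bmes(lengths):
        tags = []
        for n in lengths:
            tags.extend(["S"] if n == 1 else ["B"] + ["M"] * (n - 2) + ["E"])
        return tags

    print(lengths_to_bmes([1, 3, 2]))  # ['S', 'B', 'M', 'E', 'B', 'E']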
@@ -51,9 +51,7 @@ def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs):

os.environ["TOKENIZERS_PARALLELISM"] = "true"
dataset = load_dataset(Conllu, data_dir=data_dir, cache_dir=data_dir)
dataset = dataset.remove_columns(
["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "misc"]
)
dataset = dataset.remove_columns(["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "misc"])
dataset = dataset.map(lambda examples: tokenize(examples, tokenizer, max_length), batched=True)
dataset = dataset.filter(lambda x: not x["overflow"])
dataset.set_format(
8 changes: 2 additions & 6 deletions python/core/ltp_core/datamodules/components/bio.py
@@ -87,17 +87,13 @@ def _info(self):
def _split_generators(self, dl_manager):
"""We handle string, list and dicts in datafiles."""
if not self.config.data_files:
raise ValueError(
f"At least one data file must be specified, but got data_files={self.config.data_files}"
)
raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
data_files = dl_manager.download_and_extract(self.config.data_files)
if isinstance(data_files, (str, list, tuple)):
files = data_files
if isinstance(files, str):
files = [files]
return [
datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files})
]
return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files})]
splits = []
for split_name, files in data_files.items():
if isinstance(files, str):
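The _split_generators branch above mirrors the three data_files shapes the builder accepts. A minimal sketch of the mapping (file names are placeholders):

    # str or list -> a single TRAIN split; dict -> one split per key.
    data_files = "train.bio"
    data_files = ["part1.bio", "part2.bio"]
    data_files = {"train": "train.bio", "validation": "dev.bio"}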
48 changes: 20 additions & 28 deletions python/core/ltp_core/datamodules/components/conllu.py
@@ -66,10 +66,7 @@ def build_vocabs(data_dir, *files, min_freq=5):
counter.update(itertools.chain(*values[row]))
elif "deps" == name:
try:
deps = [
[label.split(":", maxsplit=1)[1] for label in dep.split("|")]
for dep in values[row]
]
deps = [[label.split(":", maxsplit=1)[1] for label in dep.split("|")] for dep in values[row]]
counter.update(itertools.chain(*deps))
except Exception:
counter.update("_")
@@ -166,17 +163,13 @@ def _info(self):
def _split_generators(self, dl_manager):
"""We handle string, list and dicts in datafiles."""
if not self.config.data_files:
raise ValueError(
f"At least one data file must be specified, but got data_files={self.config.data_files}"
)
raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
data_files = dl_manager.download_and_extract(self.config.data_files)
if isinstance(data_files, (str, list, tuple)):
files = data_files
if isinstance(files, str):
files = [files]
return [
datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files})
]
return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files})]
splits = []
for split_name, files in data_files.items():
if isinstance(files, str):
@@ -189,13 +182,9 @@ def _generate_examples(self, files):
logging.info("⏳ Generating examples from = %s", filename)
for line_num, block in iter_blocks(filename=filename):
# last example
id, words, lemma, upos, xpos, feats, head, deprel, deps, misc = (
list(value) for value in zip(*block)
)
id, words, lemma, upos, xpos, feats, head, deprel, deps, misc = (list(value) for value in zip(*block))
if self.config.deps:
deps = [
[label.split(":", maxsplit=1) for label in dep.split("|")] for dep in deps
]
deps = [[label.split(":", maxsplit=1) for label in dep.split("|")] for dep in deps]
deps = [
[{"id": depid, "head": int(label[0]), "rel": label[-1]} for label in dep]
for depid, dep in enumerate(deps)
@@ -204,18 +193,21 @@ def _generate_examples(self, files):
if any([dep["head"] >= len(words) for dep in deps]):
continue

yield line_num, {
"id": id,
"form": words,
"lemma": lemma,
"upos": upos,
"xpos": xpos,
"feats": feats,
"head": head,
"deprel": deprel,
"deps": deps,
"misc": misc,
}
yield (
line_num,
{
"id": id,
"form": words,
"lemma": lemma,
"upos": upos,
"xpos": xpos,
"feats": feats,
"head": head,
"deprel": deprel,
"deps": deps,
"misc": misc,
},
)


def main():
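As a worked example of the deps parsing above (a standalone sketch with a made-up CoNLL-U deps column):

    deps = ["2:nsubj", "0:root|4:conj"]  # one raw deps field per token
    deps = [[label.split(":", maxsplit=1) for label in dep.split("|")] for dep in deps]
    deps = [
        [{"id": depid, "head": int(label[0]), "rel": label[-1]} for label in dep]
        for depid, dep in enumerate(deps)
    ]
    # [[{'id': 0, 'head': 2, 'rel': 'nsubj'}],
    #  [{'id': 1, 'head': 0, 'rel': 'root'}, {'id': 1, 'head': 4, 'rel': 'conj'}]]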
25 changes: 11 additions & 14 deletions python/core/ltp_core/datamodules/components/srl.py
@@ -107,9 +107,7 @@ def _info(self):
{
"form": datasets.Sequence(datasets.Value("string")),
"predicate": datasets.Sequence(create_feature(feats["predicate"])),
"arguments": datasets.Sequence(
datasets.Sequence(create_feature(feats["arguments"]))
),
"arguments": datasets.Sequence(datasets.Sequence(create_feature(feats["arguments"]))),
}
),
supervised_keys=None,
@@ -118,17 +116,13 @@ def _info(self):
def _split_generators(self, dl_manager):
"""We handle string, list and dicts in datafiles."""
if not self.config.data_files:
raise ValueError(
f"At least one data file must be specified, but got data_files={self.config.data_files}"
)
raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
data_files = dl_manager.download_and_extract(self.config.data_files)
if isinstance(data_files, (str, list, tuple)):
files = data_files
if isinstance(files, str):
files = [files]
return [
datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files})
]
return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files})]
splits = []
for split_name, files in data_files.items():
if isinstance(files, str):
@@ -143,11 +137,14 @@ def _generate_examples(self, files):
# last example
words, predicate, *roles = (list(value) for value in zip(*block))

yield line_num, {
"form": words,
"predicate": predicate,
"arguments": roles,
}
yield (
line_num,
{
"form": words,
"predicate": predicate,
"arguments": roles,
},
)


def main():
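For orientation, the shape of one example yielded above (all values are invented placeholders; the real tag inventory comes from the dataset files):

    example = {
        "form": ["他", "喜欢", "音乐"],
        "predicate": ["O", "B-V", "O"],        # placeholder predicate tags
        "arguments": [["B-A0", "O", "B-A1"]],  # one role sequence per predicate
    }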
12 changes: 4 additions & 8 deletions python/core/ltp_core/datamodules/multi_task_datamodule.py
@@ -46,9 +46,7 @@ def __init__(self, tokenizer, datamodules, tau=0.8, num_workers=None, pin_memory

# data transformations
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
self.datamodules = {
task: info.load(tokenizer=self.tokenizer) for task, info in datamodules.items()
}
self.datamodules = {task: info.load(tokenizer=self.tokenizer) for task, info in datamodules.items()}
self.data_train: Optional[Dict[str, Dataset]] = {
name: dataset[datasets.Split.TRAIN] for name, dataset in self.datamodules.items()
}
@@ -87,14 +85,12 @@ def train_dataloader(self):
dataset=dataset,
collate_fn=collate,
batch_size=self.hparams.datamodules[name].batch_size,
num_workers=self.hparams.num_workers
or self.hparams.datamodules[name].num_workers,
pin_memory=self.hparams.pin_memory
or self.hparams.datamodules[name].pin_memory,
num_workers=self.hparams.num_workers or self.hparams.datamodules[name].num_workers,
pin_memory=self.hparams.pin_memory or self.hparams.datamodules[name].pin_memory,
shuffle=True,
)
for name, dataset in self.data_train.items()
}
},
)

def val_dataloader(self):
6 changes: 1 addition & 5 deletions python/core/ltp_core/datamodules/utils/collate.py
@@ -34,11 +34,7 @@ def collate(batch):
return torch.stack(batch, 0, out=out)
except Exception:
return torch.nn.utils.rnn.pad_sequence(batch, batch_first=True)
elif (
elem_type.__module__ == "numpy"
and elem_type.__name__ != "str_"
and elem_type.__name__ != "string_"
):
elif elem_type.__module__ == "numpy" and elem_type.__name__ != "str_" and elem_type.__name__ != "string_":
elem = batch[0]
if elem_type.__name__ == "ndarray":
# array of string classes and object
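A minimal, self-contained illustration of the padding fallback used above:

    import torch
    from torch.nn.utils.rnn import pad_sequence

    # torch.stack fails on unequal lengths, so collate pads to the longest sequence.
    batch = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
    padded = pad_sequence(batch, batch_first=True)  # tensor([[1, 2, 3], [4, 5, 0]])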
6 changes: 2 additions & 4 deletions python/core/ltp_core/datamodules/utils/datasets.py
@@ -7,14 +7,12 @@ def load_dataset(
builder_cls: type,
config_name: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Optional[
Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
] = None,
data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
split: Optional[Union[str, Split]] = None,
cache_dir: Optional[str] = None,
features: Optional[Features] = None,
save_infos: bool = False,
**config_kwargs
**config_kwargs,
) -> Union[DatasetDict, Dataset]:
# Instantiate the dataset builder
builder_instance: DatasetBuilder = builder_cls(
@@ -14,9 +14,7 @@ def __init__(self, tau=1.0, **dataloaders):
self.dataloaders = dataloaders

Z = sum(pow(v, tau) for v in self.dataloader_sizes.values())
self.tasknames, self.sampling_weights = zip(
*((k, pow(v, tau) / Z) for k, v in self.dataloader_sizes.items())
)
self.tasknames, self.sampling_weights = zip(*((k, pow(v, tau) / Z) for k, v in self.dataloader_sizes.items()))
self.dataiters = {k: cycle(v) for k, v in dataloaders.items()}

@property
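A small worked example of the temperature-based sampling weights computed above (task names and sizes are invented):

    tau = 0.8
    sizes = {"seg": 10000, "ner": 1000}
    Z = sum(pow(v, tau) for v in sizes.values())              # ~1836.1
    weights = {k: pow(v, tau) / Z for k, v in sizes.items()}  # {'seg': ~0.863, 'ner': ~0.137}
    # Proportional sampling would give ~0.909 / ~0.091; tau < 1 flattens toward small tasks.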
4 changes: 1 addition & 3 deletions python/core/ltp_core/datamodules/utils/vocab_helper.py
@@ -4,9 +4,7 @@ def vocab_builder(func):
def func_wrapper(config: BuilderConfig, **kwargs):
"""We handle string, list and dicts in datafiles."""
if not config.data_files:
raise ValueError(
f"At least one data file must be specified, but got data_files={config.data_files}"
)
raise ValueError(f"At least one data file must be specified, but got data_files={config.data_files}")
data_files = config.data_files
if isinstance(data_files, (str, list, tuple)):
files = data_files
4 changes: 1 addition & 3 deletions python/core/ltp_core/models/components/graph.py
@@ -38,9 +38,7 @@ def __init__(
)

self.arc_atten = Biaffine(arc_hidden_size, arc_hidden_size, 1, bias_x=True, bias_y=False)
self.rel_atten = Biaffine(
rel_hidden_size, rel_hidden_size, num_labels, bias_x=True, bias_y=True
)
self.rel_atten = Biaffine(rel_hidden_size, rel_hidden_size, num_labels, bias_x=True, bias_y=True)

def forward(self, hidden_states, attention_mask=None):
bs, seqlen = hidden_states.shape[:2]
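For context, a generic biaffine scorer of the kind used for arc and relation scoring above; this is a sketch of the standard formulation, not LTP's Biaffine module, whose internals are not shown in this diff:

    import torch

    def biaffine(x, y, weight, bias_x=True, bias_y=True):
        # x: (bs, n, dx), y: (bs, n, dy), weight: (n_out, dx(+1), dy(+1))
        if bias_x:
            x = torch.cat([x, torch.ones_like(x[..., :1])], dim=-1)
        if bias_y:
            y = torch.cat([y, torch.ones_like(y[..., :1])], dim=-1)
        # score[b, o, i, j] = x[b, i] @ weight[o] @ y[b, j]
        return torch.einsum("bxi,oij,byj->boxy", x, y, weight)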
19 changes: 2 additions & 17 deletions python/core/ltp_core/models/criterion/token.py
@@ -73,14 +73,7 @@ def __init__(self, temperature_scheduler: Callable[[Tensor, Tensor], float]):
super().__init__()
self.temperature_scheduler = temperature_scheduler

def forward(
self,
result: TokenClassifierResult,
labels: Tensor,
targets: Tensor = None,
*args,
**kwargs
) -> Tensor:
def forward(self, result: TokenClassifierResult, labels: Tensor, targets: Tensor = None, *args, **kwargs) -> Tensor:
loss = super().forward(result, labels, **kwargs)

if targets is not None:
@@ -108,18 +101,10 @@ def __init__(self, temperature_scheduler: Callable[[Tensor, Tensor], float]):
super().__init__()
self.temperature_scheduler = temperature_scheduler

def forward(
self,
result: TokenClassifierResult,
labels: Tensor,
targets: Tensor = None,
*args,
**kwargs
) -> Tensor:
def forward(self, result: TokenClassifierResult, labels: Tensor, targets: Tensor = None, *args, **kwargs) -> Tensor:
loss = super().forward(result, labels, **kwargs)

if targets is not None:

crf = result.crf
logits = result.logits
num_tags = logits.shape[-1]
10 changes: 2 additions & 8 deletions python/core/ltp_core/models/lit_model.py
@@ -57,9 +57,7 @@ def __init__(
metrics = instantiate(metrics)

# must use module dict
metrics = ModuleDict(
{task: MetricCollection(metric, prefix=f"{task}/") for task, metric in metrics.items()}
)
metrics = ModuleDict({task: MetricCollection(metric, prefix=f"{task}/") for task, metric in metrics.items()})
self.train_metrics = metrics
self.val_metrics = deepcopy(metrics)
self.test_metrics = deepcopy(metrics)
@@ -216,11 +214,7 @@ def num_training_steps(self) -> int:

limit_batches = self.trainer.limit_train_batches
batches = len(self.trainer.datamodule.train_dataloader())
batches = (
min(batches, limit_batches)
if isinstance(limit_batches, int)
else int(limit_batches * batches)
)
batches = min(batches, limit_batches) if isinstance(limit_batches, int) else int(limit_batches * batches)
num_devices = max(1, self.trainer.num_devices)
effective_accum = self.trainer.accumulate_grad_batches * num_devices
return (batches // effective_accum) * self.trainer.max_epochs
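A quick worked example of the step count computed above (all numbers invented):

    # 1000 batches, limit_train_batches=1.0, 2 devices,
    # accumulate_grad_batches=4, max_epochs=10:
    batches = min(1000, 1000) if isinstance(1.0, int) else int(1.0 * 1000)  # 1000
    effective_accum = 4 * 2                                                 # 8
    num_training_steps = (batches // effective_accum) * 10                  # 1250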
6 changes: 2 additions & 4 deletions python/core/ltp_core/models/metrics/token.py
@@ -51,8 +51,7 @@ def update(self, result: TokenClassifierResult, labels: Tensor, **kwargs) -> None

labels = labels.cpu().numpy()
labels = [
[self.labels[tag] for tag, mask in zip(tags, masks) if mask]
for tags, masks in zip(labels, attention_mask)
[self.labels[tag] for tag, mask in zip(tags, masks) if mask] for tags, masks in zip(labels, attention_mask)
]

if crf is None:
@@ -109,8 +108,7 @@ def update(self, result: TokenClassifierResult, labels: Tensor, **kwargs) -> None

labels = labels.cpu().numpy()
labels = [
[self.labels[tag] for tag, mask in zip(tags, masks) if mask]
for tags, masks in zip(labels, attention_mask)
[self.labels[tag] for tag, mask in zip(tags, masks) if mask] for tags, masks in zip(labels, attention_mask)
]

if crf is None:
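A standalone example of the mask-filtered label decoding above (the tag vocabulary is a placeholder):

    vocab = ["O", "B-Nh", "I-Nh"]  # placeholder label set
    labels = [[1, 2, 0, 0], [0, 0, 0, 0]]
    attention_mask = [[1, 1, 1, 0], [1, 1, 0, 0]]
    decoded = [
        [vocab[tag] for tag, mask in zip(tags, masks) if mask]
        for tags, masks in zip(labels, attention_mask)
    ]
    # [['B-Nh', 'I-Nh', 'O'], ['O', 'O']]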