Skip to content

Commit

Permalink
doc done
Browse files Browse the repository at this point in the history
  • Loading branch information
BeachWang committed Dec 17, 2024
1 parent 9b6652d commit fa306dc
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 33 deletions.
44 changes: 40 additions & 4 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,43 @@ process:
- clean_ip_mapper: # remove ip addresses from text.
- clean_links_mapper: # remove web links from text.
- clean_copyright_mapper: # remove copyright comments.
- dialog_intent_detection_mapper: # Mapper to generate user's intent labels in dialog.
api_model: 'gpt-4o' # API model name.
intent_candidates: null # The output intent candidates. Use the intent labels of the open domain if it is None.
max_round: 10 # The maximum number of dialog rounds used to build the prompt.
api_endpoint: null # URL endpoint for the API.
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
system_prompt: null # System prompt for the task.
query_template: null # Template for query part to build the input prompt.
response_template: null # Template for response part to build the input prompt.
candidate_template: null # Template for intent candidates to build the input prompt.
analysis_template: null # Template for analysis part to build the input prompt.
labels_template: null # Template for labels to build the input prompt.
analysis_pattern: null # Pattern to parse the return intent analysis.
labels_pattern: null # Pattern to parse the return intent labels.
try_num: 3 # The number of retry attempts when there is an API call error or output parsing error.
model_params: {} # Parameters for initializing the API model.
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- dialog_sentiment_detection_mapper: # Mapper to generate user's sentiment labels in dialog.
api_model: 'gpt-4o' # API model name.
max_round: 10 # The maximum number of dialog rounds used to build the prompt.
api_endpoint: null # URL endpoint for the API.
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
system_prompt: null # System prompt for the task.
query_template: null # Template for query part to build the input prompt.
response_template: null # Template for response part to build the input prompt.
analysis_template: null # Template for analysis part to build the input prompt.
labels_template: null # Template for labels part to build the input prompt.
analysis_pattern: null # Pattern to parse the return sentiment analysis.
labels_pattern: null # Pattern to parse the return sentiment labels.
try_num: 3 # The number of retry attempts when there is an API call error or output parsing error.
model_params: {} # Parameters for initializing the API model.
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- dialog_sentiment_intensity_mapper: # Mapper to predict user's sentiment intensity (from -5 to 5 in default prompt) in dialog.
api_model: 'gpt-4o' # API model name.
max_round: 10 # The maximum number of dialog rounds used to build the prompt.
api_endpoint: null # URL endpoint for the API.
esponse_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
system_prompt: null # System prompt for the task.
query_template: null # Template for query part to build the input prompt.
response_template: null # Template for response part to build the input prompt.
Expand Down Expand Up @@ -292,12 +324,16 @@ process:
- python_lambda_mapper: # executing Python lambda function on data samples.
lambda_str: '' # A string representation of the lambda function to be executed on data samples. If empty, the identity function is used.
batched: False # A boolean indicating whether to process input data in batches.
- query_sentiment_detection_mapper: # Mapper to predict user's sentiment intensity label ('negative', 'neutral' and 'positive') in query.
hf_model: 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis' # Hugginface model ID to predict sentiment intensity.
- query_intent_detection_mapper: # Mapper to predict user's Intent label in query.
hf_model: 'bespin-global/klue-roberta-small-3i4k-intent-classification' # Hugging Face model ID to predict intent label.
zh_to_en_hf_model: 'Helsinki-NLP/opus-mt-zh-en' # Translation model from Chinese to English. If not None, translate the query from Chinese to English.
model_params: {} # model param for hf_model.
zh_to_en_model_params: {} # model params for zh_to_en_hf_model.
- query_sentiment_detection_mapper: # Mapper to predict user's sentiment label ('negative', 'neutral' and 'positive') in query.
hf_model: 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis' # Hugging Face model ID to predict sentiment label.
zh_to_en_hf_model: 'Helsinki-NLP/opus-mt-zh-en' # Translation model from Chinese to English. If not None, translate the query from Chinese to English.
model_params: {} # model param for hf_model.
zh_to_en_model_params: {} # model params for zh_to_en_hf_model.
label_to_intensity: null # Map the output labels to the intensities instead of the default mapper if not None.
- relation_identity_mapper: # identify relation between two entity in the text.
api_model: 'gpt-4o' # API model name.
source_entity: '孙悟空' # The source entity of the relation to be identified.
Expand Down
21 changes: 3 additions & 18 deletions data_juicer/ops/mapper/query_intent_detection_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,36 +13,30 @@
class QueryIntentDetectionMapper(Mapper):
"""
Mapper to predict user's Intent label in query. Input from query_key.
Output intensity label and corresponding score for the query, which is
Output intent label and corresponding score for the query, which are
stored in 'intent.query_label' and 'intent.query_label_score' in
Data-Juicer meta field.
"""

_accelerator = 'cuda'
_batched_op = True

DEFAULT_LABEL_TO_INTENSITY = {}

def __init__(
self,
hf_model:
str = 'bespin-global/klue-roberta-small-3i4k-intent-classification', # noqa: E501 E131
zh_to_en_hf_model: Optional[str] = 'Helsinki-NLP/opus-mt-zh-en',
model_params: Dict = {},
zh_to_en_model_params: Dict = {},
*,
label_to_intensity: Dict = None,
**kwargs):
"""
Initialization method.
:param hf_model: Hugginface model ID to predict sentiment intensity.
:param hf_model: Hugging Face model ID to predict intent label.
:param zh_to_en_hf_model: Translation model from Chinese to English.
If not None, translate the query from Chinese to English.
:param model_params: model param for hf_model.
:param zh_to_en_model_params: model params for zh_to_en_hf_model.
:param label_to_intensity: Map the output labels to the intensities
instead of the default mapper if not None.
:param kwargs: Extra keyword arguments.
"""
super().__init__(**kwargs)
Expand All @@ -63,11 +57,6 @@ def __init__(
else:
self.zh_to_en_model_key = None

if label_to_intensity is not None:
self.label_to_intensity = label_to_intensity
else:
self.label_to_intensity = self.DEFAULT_LABEL_TO_INTENSITY

def process_batched(self, samples, rank=None):
queries = samples[self.query_key]

Expand All @@ -79,11 +68,7 @@ def process_batched(self, samples, rank=None):

classifier, _ = get_model(self.model_key, rank, self.use_cuda())
results = classifier(queries)
intensities = [
self.label_to_intensity[r['label']]
if r['label'] in self.label_to_intensity else r['label']
for r in results
]
intensities = [r['label'] for r in results]
scores = [r['score'] for r in results]

if Fields.meta not in samples:
Expand Down
6 changes: 0 additions & 6 deletions data_juicer/ops/mapper/query_sentiment_detection_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,6 @@ class QuerySentimentDetectionMapper(Mapper):
_accelerator = 'cuda'
_batched_op = True

DEFAULT_LABEL_TO_INTENSITY = {
'negative': -1,
'neutral': 0,
'positive': 1,
}

def __init__(
self,
hf_model:
Expand Down
3 changes: 3 additions & 0 deletions data_juicer/utils/auto_install_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@
'video_tagging_from_frames_mapper': ['ram', 'torch'],
'text_entity_dependency_filter': ['spacy-pkuseg'],
'optimize_response_mapper': ['torch', 'transformers', 'vllm'],
'dialog_intent_detection_mapper': ['openai'],
'dialog_sentiment_detection_mapper': ['openai'],
'dialog_sentiment_intensity_mapper': ['openai'],
'query_intent_detection_mapper': ['transformers'],
'query_sentiment_detection_mapper': ['transformers'],
}
Loading

0 comments on commit fa306dc

Please sign in to comment.