doc done

modelscope · Dec 17, 2024 · fa306dc · fa306dc
1 parent 9b6652d
commit fa306dc
Show file tree

Hide file tree

Showing 7 changed files with 57 additions and 33 deletions.
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
@@ -77,11 +77,43 @@ process:
   - clean_ip_mapper:                                        # remove ip addresses from text.
   - clean_links_mapper:                                     # remove web links from text.
   - clean_copyright_mapper:                                 # remove copyright comments.
+  - dialog_intent_detection_mapper:                         # Mapper to generate user's intent labels in dialog.
+      api_model: 'gpt-4o'                                     # API model name.
+      intent_candidates: null                                 # The output intent candidates. Use the intent labels of the open domain if it is None.
+      max_round: 10                                           # The max num of round in the dialog to build the prompt.
+      api_endpoint: null                                      # URL endpoint for the API.
+      response_path: null                                     # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
+      system_prompt: null                                     # System prompt for the task.
+      query_template: null                                    # Template for query part to build the input prompt.
+      response_template: null                                 # Template for response part to build the input prompt.
+      candidate_template: null                                # Template for intent candidates to build the input prompt.
+      analysis_template: null                                 # Template for analysis part to build the input prompt.
+      labels_template: null                                   # Template for labels to build the input prompt.
+      analysis_pattern: null                                  # Pattern to parse the return intent analysis.
+      labels_pattern: null                                    # Pattern to parse the return intent labels.
+      try_num: 3                                              # The number of retry attempts when there is an API call error or output parsing error.
+      model_params: {}                                        # Parameters for initializing the API model.
+      sampling_params: {}                                     # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
+  - dialog_sentiment_detection_mapper:                      # Mapper to generate user's sentiment labels in dialog.
+      api_model: 'gpt-4o'                                     # API model name.
+      max_round: 10                                           # The max num of round in the dialog to build the prompt.
+      api_endpoint: null                                      # URL endpoint for the API.
+      response_path: null                                     # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
+      system_prompt: null                                     # System prompt for the task.
+      query_template: null                                    # Template for query part to build the input prompt.
+      response_template: null                                 # Template for response part to build the input prompt.
+      analysis_template: null                                 # Template for analysis part to build the input prompt.
+      labels_template: null                                   # Template for labels part to build the input prompt.
+      analysis_pattern: null                                  # Pattern to parse the return sentiment analysis.
+      labels_pattern: null                                    # Pattern to parse the return sentiment labels.
+      try_num: 3                                              # The number of retry attempts when there is an API call error or output parsing error.
+      model_params: {}                                        # Parameters for initializing the API model.
+      sampling_params: {}                                     # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
   - dialog_sentiment_intensity_mapper:                      # Mapper to predict user's sentiment intensity (from -5 to 5 in default prompt) in dialog.
       api_model: 'gpt-4o'                                     # API model name.
       max_round: 10                                           # The max num of round in the dialog to build the prompt.
       api_endpoint: null                                      # URL endpoint for the API.
-      esponse_path: null                                      # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
+      response_path: null                                     # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
       system_prompt: null                                     # System prompt for the task.
       query_template: null                                    # Template for query part to build the input prompt.
       response_template: null                                 # Template for response part to build the input prompt.
@@ -292,12 +324,16 @@ process:
   - python_lambda_mapper:                                   # executing Python lambda function on data samples.
       lambda_str: ''                                          # A string representation of the lambda function to be executed on data samples. If empty, the identity function is used.
       batched: False                                          # A boolean indicating whether to process input data in batches.
-  - query_sentiment_detection_mapper:                       # Mapper to predict user's sentiment intensity label ('negative', 'neutral' and 'positive') in query.
-      hf_model: 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis'     # Hugginface model ID to predict sentiment intensity.
+  - query_intent_detection_mapper:                          # Mapper to predict user's Intent label in query.
+      hf_model: 'bespin-global/klue-roberta-small-3i4k-intent-classification'     # Hugging Face model ID to predict intent label.
+      zh_to_en_hf_model: 'Helsinki-NLP/opus-mt-zh-en'         # Translation model from Chinese to English. If not None, translate the query from Chinese to English.
+      model_params: {}                                        # model param for hf_model.
+      zh_to_en_model_params: {}                               # model param for zh_to_en_hf_model.
+  - query_sentiment_detection_mapper:                       # Mapper to predict user's sentiment label ('negative', 'neutral' and 'positive') in query.
+      hf_model: 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis'     # Hugging Face model ID to predict sentiment label.
       zh_to_en_hf_model: 'Helsinki-NLP/opus-mt-zh-en'         # Translation model from Chinese to English. If not None, translate the query from Chinese to English.
       model_params: {}                                        # model param for hf_model.
       zh_to_en_model_params: {}                               # model param for zh_to_en_hf_model.
-      label_to_intensity: null                                # Map the output labels to the intensities instead of the default mapper if not None.
   - relation_identity_mapper:                               # identify relation between two entity in the text.
       api_model: 'gpt-4o'                                     # API model name.
       source_entity: '孙悟空'                                  # The source entity of the relation to be identified.

diff --git a/data_juicer/ops/mapper/query_intent_detection_mapper.py b/data_juicer/ops/mapper/query_intent_detection_mapper.py
@@ -13,36 +13,30 @@
 class QueryIntentDetectionMapper(Mapper):
     """
     Mapper to predict user's Intent label in query. Input from query_key.
-    Output intensity label and corresponding score for the query, which is
+    Output intent label and corresponding score for the query, which is
     stored in 'intent.query_label' and 'intent.query_label_score' in
     Data-Juicer meta field.
     """
 
     _accelerator = 'cuda'
     _batched_op = True
 
-    DEFAULT_LABEL_TO_INTENSITY = {}
-
     def __init__(
             self,
             hf_model:
         str = 'bespin-global/klue-roberta-small-3i4k-intent-classification',  # noqa: E501 E131
             zh_to_en_hf_model: Optional[str] = 'Helsinki-NLP/opus-mt-zh-en',
             model_params: Dict = {},
             zh_to_en_model_params: Dict = {},
-            *,
-            label_to_intensity: Dict = None,
             **kwargs):
         """
         Initialization method.
 
-        :param hf_model: Hugginface model ID to predict sentiment intensity.
+        :param hf_model: Hugging Face model ID to predict intent label.
         :param zh_to_en_hf_model: Translation model from Chinese to English.
             If not None, translate the query from Chinese to English.
         :param model_params: model param for hf_model.
         :param zh_to_en_model_params: model param for zh_to_en_hf_model.
-        :param label_to_intensity: Map the output labels to the intensities
-            instead of the default mapper if not None.
         :param kwargs: Extra keyword arguments.
         """
         super().__init__(**kwargs)
@@ -63,11 +57,6 @@ def __init__(
         else:
             self.zh_to_en_model_key = None
 
-        if label_to_intensity is not None:
-            self.label_to_intensity = label_to_intensity
-        else:
-            self.label_to_intensity = self.DEFAULT_LABEL_TO_INTENSITY
-
     def process_batched(self, samples, rank=None):
         queries = samples[self.query_key]
 
@@ -79,11 +68,7 @@ def process_batched(self, samples, rank=None):
 
         classifier, _ = get_model(self.model_key, rank, self.use_cuda())
         results = classifier(queries)
-        intensities = [
-            self.label_to_intensity[r['label']]
-            if r['label'] in self.label_to_intensity else r['label']
-            for r in results
-        ]
+        intensities = [r['label'] for r in results]
         scores = [r['score'] for r in results]
 
         if Fields.meta not in samples:

diff --git a/data_juicer/ops/mapper/query_sentiment_detection_mapper.py b/data_juicer/ops/mapper/query_sentiment_detection_mapper.py
@@ -22,12 +22,6 @@ class QuerySentimentDetectionMapper(Mapper):
     _accelerator = 'cuda'
     _batched_op = True
 
-    DEFAULT_LABEL_TO_INTENSITY = {
-        'negative': -1,
-        'neutral': 0,
-        'positive': 1,
-    }
-
     def __init__(
             self,
             hf_model:

diff --git a/data_juicer/utils/auto_install_mapping.py b/data_juicer/utils/auto_install_mapping.py
@@ -80,6 +80,9 @@
     'video_tagging_from_frames_mapper': ['ram', 'torch'],
     'text_entity_dependency_filter': ['spacy-pkuseg'],
     'optimize_response_mapper': ['torch', 'transformers', 'vllm'],
+    'dialog_intent_detection_mapper': ['openai'],
+    'dialog_sentiment_detection_mapper': ['openai'],
     'dialog_sentiment_intensity_mapper': ['openai'],
+    'query_intent_detection_mapper': ['transformers'],
     'query_sentiment_detection_mapper': ['transformers'],
 }