v2: edit cases

moevm · Sep 30, 2024 · 5ecde02 · 5ecde02
1 parent 88f199c
commit 5ecde02
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 13 deletions.
diff --git a/app/db/db_methods.py b/app/db/db_methods.py
@@ -36,6 +36,7 @@ def save_image_to_db(check_id, image_data, caption):
         'caption': caption
     })
     images_collection.insert_one(image.pack())
+    print(str(check_id) + " " + str(caption))
 
 
 # Returns the user if it was created, or None if the user already exists

diff --git a/app/main/presentations/pptx/presentation_pptx.py b/app/main/presentations/pptx/presentation_pptx.py
@@ -30,16 +30,15 @@ def extract_images_with_captions(self, check_id):
             image_data = None
             caption_text = None
 
-            # Проход по всем шейпам на слайде
+            # Проход по всем фигурам на слайде
             for shape in slide.slide.shapes:  # Используем slide.slide для доступа к текущему слайду
-                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:  # Тип 13 соответствует PICTURE
+                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                     image_found = True
                     image_part = shape.image  # Получаем объект изображения
 
                     # Извлекаем бинарные данные изображения
                     image_stream = image_part.blob
                     image_data = BytesIO(image_stream)
-                    print(f"Изображение найдено на слайде {slide.index}")
 
                 # Если мы нашли изображение, ищем следующий непустой текст как подпись
                 if image_found:
@@ -51,7 +50,6 @@ def extract_images_with_captions(self, check_id):
                             caption_text = text
                             # Сохраняем изображение и его подпись
                             save_image_to_db(check_id, image_data.getvalue(), caption_text)
-                            print(f"Подпись найдена: '{caption_text}' на слайде {slide.index}")
                             break  # Предполагаем, что это подпись к текущему изображению
 
                     # Сброс флага и данных изображения для следующего цикла

diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
@@ -252,8 +252,7 @@ def extract_images_with_captions(self, check_id):
         for i, paragraph in enumerate(self.file.paragraphs):
             # Проверяем, есть ли в параграфе встроенные объекты
             for run in paragraph.runs:
-                if "graphic" in run._element.xml:  # Это может быть изображение
-                    image_found = True
+                if "graphic" in run._element.xml:  # может быть изображение
 
                     # Извлечение бинарных данных изображения
                     image_streams = run._element.findall('.//a:blip', namespaces={
@@ -262,19 +261,36 @@ def extract_images_with_captions(self, check_id):
                         embed_id = image_stream.get(
                             '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
                         if embed_id:
+                            image_found = True
                             image_data = self.file.part.related_parts[embed_id].blob
 
                 # Если мы уже нашли изображение, ищем следующий непустой параграф для подписи
                 if image_found:
                     # Переход к следующему параграфу
                     next_paragraph_index = i + 1
-                    while next_paragraph_index < len(self.file.paragraphs):
-                        next_paragraph_text = self.file.paragraphs[next_paragraph_index].text.strip()
-                        if next_paragraph_text:  # Находим непустой параграф
-                            # Сохраняем изображение и его подпись
-                            save_image_to_db(check_id, image_data, next_paragraph_text)
-                            break
-                        next_paragraph_index += 1
+
+                    # Проверяем, есть ли следующий параграф
+                    if next_paragraph_index < len(self.file.paragraphs):
+                        while next_paragraph_index < len(self.file.paragraphs):
+                            next_paragraph = self.file.paragraphs[next_paragraph_index]
+                            next_paragraph_text = next_paragraph.text.strip()
+
+                            # Проверка, не содержит ли следующий параграф также изображение
+                            contains_image = any(
+                                "graphic" in run._element.xml for run in next_paragraph.runs
+                            )
+
+                            # Если параграф не содержит изображения и текст не пуст, то это подпись
+                            if not contains_image and next_paragraph_text:
+                                # Сохраняем изображение и его подпись
+                                save_image_to_db(check_id, image_data, next_paragraph_text)
+                                break
+                            else:
+                                save_image_to_db(check_id, image_data, "picture without caption")
+                                break
+                    else:
+                        save_image_to_db(check_id, image_data, "picture without caption")
+
                     image_found = False  # Сброс флага, чтобы искать следующее изображение
                     image_data = None  # Очистка данных изображения