-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request !9 from 陈阳/master
- Loading branch information
Showing 16 changed files with 574 additions and 19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build for the EaseSearch Merlin data importer.
  The assembly plugin is bound to the package phase and produces a single
  jar-with-dependencies whose manifest main class is App.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.search</groupId>
    <artifactId>EaseSearch-data-import-merlin</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>17</maven.compiler.source>
        <maven.compiler.target>17</maven.compiler.target>
        <!-- Pin the source encoding so the build does not depend on the platform
             default charset (the sources contain non-ASCII text). -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>

        <!-- Versions shared by several artifacts are declared once so the
             related artifacts cannot drift apart. -->
        <commonmark.version>0.21.0</commonmark.version>
        <elasticsearch.version>7.17.5</elasticsearch.version>
        <log4j.version>2.21.1</log4j.version>
    </properties>

    <dependencies>
        <!-- PDF handling -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.19</version>
        </dependency>

        <dependency>
            <groupId>com.itextpdf</groupId>
            <artifactId>itextpdf</artifactId>
            <version>5.5.13.2</version>
        </dependency>

        <!-- File utilities (FileUtils) -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.7</version>
        </dependency>

        <!-- Markdown parsing / HTML rendering -->
        <dependency>
            <groupId>org.commonmark</groupId>
            <artifactId>commonmark</artifactId>
            <version>${commonmark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.commonmark</groupId>
            <artifactId>commonmark-ext-gfm-tables</artifactId>
            <version>${commonmark.version}</version>
        </dependency>

        <!-- HTML parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.15.3</version>
        </dependency>

        <!-- Elasticsearch server classes and REST client; keep both on the same version -->
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch</artifactId>
            <version>${elasticsearch.version}</version>
        </dependency>

        <dependency>
            <groupId>org.elasticsearch.client</groupId>
            <artifactId>elasticsearch-rest-high-level-client</artifactId>
            <version>${elasticsearch.version}</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>2.0.7</version>
        </dependency>

        <!-- YAML front-matter parsing -->
        <dependency>
            <groupId>org.yaml</groupId>
            <artifactId>snakeyaml</artifactId>
            <version>1.32</version>
        </dependency>

        <!-- Logging -->
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>${log4j.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-api</artifactId>
            <version>${log4j.version}</version>
        </dependency>

        <!-- NOTE(review): slf4j-log4j12 binds SLF4J to Log4j 1.x, while the two
             artifacts above are Log4j 2. Confirm whether log4j-slf4j-impl was
             intended; left unchanged because the logging configuration is not
             visible here. -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.21</version>
        </dependency>

        <!-- Project-internal Elasticsearch client module -->
        <dependency>
            <groupId>com.search</groupId>
            <artifactId>es-client</artifactId>
            <version>1.0-SNAPSHOT</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>import</finalName>
        <plugins>
            <!-- Builds the executable jar-with-dependencies during "package". -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.5.0</version>
                <configuration>
                    <!-- Do not append "-jar-with-dependencies" to the artifact name. -->
                    <appendAssemblyId>false</appendAssemblyId>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <manifest>
                            <mainClass>App</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import java.io.File; | ||
import java.util.Collection; | ||
import java.util.HashSet; | ||
import java.util.Map; | ||
import java.util.Set; | ||
|
||
import org.apache.commons.io.FileUtils; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
public class App { | ||
|
||
private static final String TARGET = System.getenv("TARGET"); | ||
|
||
private static final String APPLICATION_PATH = System.getenv("APPLICATION_PATH"); | ||
|
||
private static final String MAPPING_PATH = System.getenv("MAPPING_PATH"); | ||
|
||
private static final String INDEX_PREFIX = "merlin_articles"; | ||
|
||
private static final Logger logger = LoggerFactory.getLogger(App.class); | ||
|
||
|
||
public static void main(String[] args) { | ||
try { | ||
PublicClient.CreateClientFormConfig(APPLICATION_PATH); | ||
PublicClient.makeIndex(INDEX_PREFIX + "_zh", MAPPING_PATH); | ||
PublicClient.makeIndex(INDEX_PREFIX + "_en", MAPPING_PATH); | ||
fileDate(); | ||
} catch (Exception e) { | ||
logger.error(e.getMessage()); | ||
logger.error(e.toString()); | ||
} | ||
|
||
logger.info("import end"); | ||
System.exit(0); | ||
} | ||
|
||
|
||
public static void fileDate() { | ||
File indexFile = new File(TARGET); | ||
if (!indexFile.exists()) { | ||
logger.error("folder does not exist: ", indexFile.getPath()); | ||
return; | ||
} | ||
|
||
logger.info("begin to update document"); | ||
|
||
Set<String> idSet = new HashSet<>(); | ||
|
||
Collection<File> listFiles = FileUtils.listFiles(indexFile, new String[]{"md", "html"}, true); | ||
|
||
for (File paresFile : listFiles) { | ||
if (!paresFile.getName().startsWith("_")) { | ||
try { | ||
Map<String, Object> escape = Parse.parse(paresFile); | ||
if (null != escape) { | ||
PublicClient.insert(escape, INDEX_PREFIX + "_" + escape.get("lang")); | ||
idSet.add((String) escape.get("path")); | ||
} else { | ||
System.out.println("parse null : " + paresFile.getPath()); | ||
} | ||
} catch (Exception e) { | ||
logger.info(paresFile.getPath()); | ||
logger.info(e.getMessage()); | ||
} | ||
} | ||
} | ||
logger.info("start delete expired document"); | ||
PublicClient.deleteExpired(idSet, INDEX_PREFIX + "_*"); | ||
|
||
} | ||
|
||
} | ||
|
||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
import java.io.File; | ||
import java.nio.charset.StandardCharsets; | ||
import java.text.SimpleDateFormat; | ||
import java.util.HashMap; | ||
import java.util.Locale; | ||
import java.util.Map; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
import org.apache.commons.io.FileUtils; | ||
import org.commonmark.node.Node; | ||
import org.commonmark.parser.Parser; | ||
import org.commonmark.renderer.html.HtmlRenderer; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.select.Elements; | ||
import org.yaml.snakeyaml.Yaml; | ||
|
||
public class Parse { | ||
|
||
public static final String DOCS = "docs"; | ||
|
||
public static final String BASEPATH = System.getenv("TARGET") + "/"; | ||
|
||
public static Map<String, Object> parse(File file) throws Exception { | ||
String originalPath = file.getPath(); | ||
String fileName = file.getName(); | ||
String path = originalPath | ||
.replace("\\", "/") | ||
.replace(BASEPATH, "") | ||
.replace("\\\\", "/") | ||
.replace(".md", "") | ||
.replace(".html", ""); | ||
|
||
String type = path.substring(0, path.indexOf("/")); | ||
|
||
String lang = path.substring(type.length() + 1, path.indexOf("/", type.length() + 1)); | ||
|
||
Map<String, Object> jsonMap = new HashMap<>(); | ||
jsonMap.put("lang", lang); | ||
jsonMap.put("type", type); | ||
jsonMap.put("articleName", fileName); | ||
jsonMap.put("path", path); | ||
|
||
String fileContent = FileUtils.readFileToString(file, StandardCharsets.UTF_8); | ||
|
||
if (type.equals(DOCS)) { | ||
parseDocsType(jsonMap, fileContent, fileName); | ||
|
||
int pelen = lang.length() + type.length() + 2; | ||
|
||
String docsType = path.substring(pelen, path.indexOf("/", pelen)); | ||
jsonMap.put("docsType", docsType); | ||
} | ||
|
||
return jsonMap; | ||
} | ||
|
||
|
||
|
||
public static void parseDocsType(Map<String, Object> jsonMap, String fileContent, String fileName) { | ||
String r = ""; | ||
if (fileContent.contains("---")) { | ||
fileContent = fileContent.substring(fileContent.indexOf("---") + 3); | ||
if (fileContent.contains("---")) { | ||
r = fileContent.substring(0, fileContent.indexOf("---")); | ||
fileContent = fileContent.substring(fileContent.indexOf("---") + 3); | ||
} | ||
} | ||
|
||
Parser parser = Parser.builder().build(); | ||
HtmlRenderer renderer = HtmlRenderer.builder().build(); | ||
Node document = parser.parse(fileContent); | ||
Document node = Jsoup.parse(renderer.render(document)); | ||
|
||
if (node.getElementsByTag("h1").size() > 0) { | ||
jsonMap.put("title", node.getElementsByTag("h1").first().text()); | ||
} else { | ||
jsonMap.put("title", fileName); | ||
} | ||
|
||
jsonMap.put("textContent", node.text()); | ||
|
||
Elements h1 = node.getElementsByTag("h1"); | ||
Elements h2 = node.getElementsByTag("h2"); | ||
Elements h3 = node.getElementsByTag("h3"); | ||
Elements h4 = node.getElementsByTag("h4"); | ||
Elements h5 = node.getElementsByTag("h5"); | ||
Elements strong = node.getElementsByTag("strong"); | ||
|
||
jsonMap.put("h1", h1.text()); | ||
jsonMap.put("h2", h2.text()); | ||
jsonMap.put("h3", h3.text()); | ||
jsonMap.put("h4", h4.text()); | ||
jsonMap.put("h5", h5.text()); | ||
jsonMap.put("strong", strong.text()); | ||
|
||
Yaml yaml = new Yaml(); | ||
Map<String, Object> ret = yaml.load(r); | ||
String key = ""; | ||
Object value = ""; | ||
|
||
for (Map.Entry<String, Object> entry : ret.entrySet()) { | ||
key = entry.getKey().toLowerCase(Locale.ROOT); | ||
value = entry.getValue(); | ||
if (key.equals("date")) { | ||
//需要处理日期不标准导致的存入ES失败的问题。 | ||
String dateString = ""; | ||
if (value.getClass().getSimpleName().equals("Date")) { | ||
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd"); | ||
dateString = format.format(value); | ||
} else { | ||
dateString = value.toString(); | ||
} | ||
Pattern pattern = Pattern.compile("\\D"); //匹配所有非数字 | ||
Matcher matcher = pattern.matcher(dateString); | ||
dateString = matcher.replaceAll("-"); | ||
if (dateString.length() < 10) { | ||
StringBuilder stringBuilder = new StringBuilder(dateString); | ||
if (stringBuilder.charAt(7) != '-') { | ||
stringBuilder.insert(5, "0"); | ||
} | ||
if (stringBuilder.length() < 10) { | ||
stringBuilder.insert(8, "0"); | ||
} | ||
dateString = stringBuilder.toString(); | ||
} | ||
value = dateString; | ||
} | ||
if (key.equals("author") && value instanceof String) { | ||
value = new String[]{value.toString()}; | ||
} | ||
if (key.equals("head")) { | ||
continue; | ||
} | ||
if (key.equals("title")) { | ||
key = "specify"; | ||
} | ||
jsonMap.put(key, value); | ||
} | ||
if (jsonMap.containsKey("date")) { | ||
jsonMap.put("archives", jsonMap.get("date").toString().substring(0, 7)); | ||
} | ||
|
||
} | ||
|
||
} |
Oops, something went wrong.