Skip to content

Commit

Permalink
!9 merlin搜索数据导入,单词拼写错误修改
Browse files Browse the repository at this point in the history
Merge pull request !9 from 陈阳/master
  • Loading branch information
JoannaNil authored and gitee-org committed Apr 11, 2024
2 parents a4318e8 + 6a4a62e commit 1be6ed4
Show file tree
Hide file tree
Showing 16 changed files with 574 additions and 19 deletions.
10 changes: 4 additions & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ ENV LANG="C.UTF-8"

ARG COMMUNITY=openeuler

# Build-time argument TOKEN, re-exported below as an environment variable of the image.
# NOTE(review): a value set through ENV is stored in the image configuration and can be
# read back with `docker inspect` / `docker history`. If TOKEN is a credential, confirm
# this is acceptable or inject it at container start instead.
ARG TOKEN

ENV TOKEN=${TOKEN}

WORKDIR /

RUN apt update \
Expand Down Expand Up @@ -54,12 +58,6 @@ ENV JAVA_HOME=${WORKSPACE}/jre
ENV PATH=${JAVA_HOME}/bin:$PATH
ENV MAPPING_PATH=${WORKSPACE}/target/classes/mapping.json

ARG GITEE_USER
ARG GITEE_PASSWORD

ENV GITEE_USER=${GITEE_USER}
ENV GITEE_PASSWORD=${GITEE_PASSWORD}

USER easysearch

CMD java -jar ${WORKSPACE}/target/import.jar
Expand Down
1 change: 0 additions & 1 deletion es-client/src/main/java/EsClient.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ public class EsClient {
* 创建客户端的类,定义create函数用于创建客户端。
*/
public static RestHighLevelClient create(List<String> host, int port, String protocol, int connectTimeout, int connectionRequestTimeout, int socketTimeout, String username, String password) throws IOException {
// return new RestHighLevelClient(RestClient.builder(new HttpHost("192.168.1.203", 9200, "http")));
final CredentialsProvider credentialsProvider = new BasicCredentialsProvider();
credentialsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(username, password));
SSLContext sc = null;
Expand Down
131 changes: 131 additions & 0 deletions merlin/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>com.search</groupId>
<artifactId>EaseSearch-data-import-merlin</artifactId>
<version>1.0-SNAPSHOT</version>

<!-- Compile the module at Java 17 language level (source and target). -->
<properties>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
</properties>

<dependencies>
<!-- PDF libraries. NOTE(review): App.java and Parse.java in this module do not import them; confirm they are still needed. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.19</version>
</dependency>

<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.5.13.2</version>
</dependency>
<!-- File helpers: App and Parse use org.apache.commons.io.FileUtils to list and read the source files. -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.7</version>
</dependency>

<!-- Markdown parser/HTML renderer used by Parse to turn document bodies into HTML. -->
<dependency>
<groupId>org.commonmark</groupId>
<artifactId>commonmark</artifactId>
<version>0.21.0</version>
</dependency>

<dependency>
<groupId>org.commonmark</groupId>
<artifactId>commonmark-ext-gfm-tables</artifactId>
<version>0.21.0</version>
</dependency>

<!-- HTML parser used by Parse to extract headings and plain text from the rendered HTML. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.15.3</version>
</dependency>

<!-- Elasticsearch core and high-level REST client; both are pinned to the same version. -->
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>7.17.5</version>
</dependency>

<dependency>
<groupId>org.elasticsearch.client</groupId>
<artifactId>elasticsearch-rest-high-level-client</artifactId>
<version>7.17.5</version>
</dependency>

<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>2.0.7</version>
</dependency>

<!-- YAML parser used by Parse to read the front matter of each document. -->
<dependency>
<groupId>org.yaml</groupId>
<artifactId>snakeyaml</artifactId>
<version>1.32</version>
</dependency>

<!-- Logging. App logs through the SLF4J API (org.slf4j.Logger). -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.21.1</version>
</dependency>

<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.21.1</version>
</dependency>
<!-- NOTE(review): slf4j-log4j12 binds SLF4J to Log4j 1.x, while the two artifacts above are Log4j 2.
     Confirm which backend is intended; routing SLF4J to Log4j 2 needs the log4j-slf4j-impl binding instead. -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.21</version>
</dependency>
<!-- Presumably the es-client module of this repository (es-client/src/main/java/EsClient.java); verify. -->
<dependency>
<groupId>com.search</groupId>
<artifactId>es-client</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>

</dependencies>

<!-- Builds a single runnable jar: with finalName "import" and appendAssemblyId=false the
     jar-with-dependencies assembly is written as target/import.jar, with App as its Main-Class. -->
<build>
<finalName>import</finalName>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.5.0</version>
<configuration>
<!-- Keep the artifact name free of the "-jar-with-dependencies" suffix. -->
<appendAssemblyId>false</appendAssemblyId>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<archive>
<manifest>
<!-- App is declared in the default package, hence no package prefix. -->
<mainClass>App</mainClass>
</manifest>
</archive>
</configuration>
<executions>
<!-- Run the assembly automatically as part of the package phase. -->
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>

</project>
80 changes: 80 additions & 0 deletions merlin/src/main/java/App.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import java.io.File;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command-line entry point of the merlin search-data importer.
 *
 * <p>It creates the search client from the configuration file, makes sure the
 * {@code merlin_articles_zh} and {@code merlin_articles_en} indices exist, imports every
 * {@code .md}/{@code .html} file found under the {@code TARGET} folder and finally asks
 * {@link PublicClient} to delete the documents that were not imported in this run.
 */
public class App {

    /** Root folder of the files to import, taken from the {@code TARGET} environment variable. */
    private static final String TARGET = System.getenv("TARGET");

    /** Path handed to {@code PublicClient.CreateClientFormConfig}, from {@code APPLICATION_PATH}. */
    private static final String APPLICATION_PATH = System.getenv("APPLICATION_PATH");

    /** Path of the index mapping handed to {@code PublicClient.makeIndex}, from {@code MAPPING_PATH}. */
    private static final String MAPPING_PATH = System.getenv("MAPPING_PATH");

    /** Common prefix of the per-language indices ({@code <prefix>_<lang>}). */
    private static final String INDEX_PREFIX = "merlin_articles";

    private static final Logger logger = LoggerFactory.getLogger(App.class);

    /**
     * Runs one complete import and then terminates the JVM.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            PublicClient.CreateClientFormConfig(APPLICATION_PATH);
            PublicClient.makeIndex(INDEX_PREFIX + "_zh", MAPPING_PATH);
            PublicClient.makeIndex(INDEX_PREFIX + "_en", MAPPING_PATH);
            fileDate();
        } catch (Exception e) {
            // Hand the throwable to the logger so the stack trace is kept, not only the message.
            logger.error("import failed", e);
        }

        logger.info("import end");
        // The exit status stays 0 even after a failure, exactly as before.
        // NOTE(review): confirm whether the job scheduler should see a non-zero status on failure.
        System.exit(0);
    }

    /**
     * Imports every {@code .md} and {@code .html} file below {@code TARGET} (recursively, skipping
     * files whose name starts with {@code _}) and then deletes the expired documents.
     *
     * <p>A file that cannot be parsed or inserted is logged and skipped; it does not abort the run.
     */
    public static void fileDate() {
        if (TARGET == null || TARGET.isEmpty()) {
            // new File(null) would otherwise fail with an uninformative NullPointerException.
            logger.error("TARGET environment variable is not set");
            return;
        }

        File indexFile = new File(TARGET);
        // isDirectory() is also false when the path does not exist; listFiles needs a directory.
        if (!indexFile.isDirectory()) {
            logger.error("folder does not exist: {}", indexFile.getPath());
            return;
        }

        logger.info("begin to update document");

        Set<String> idSet = new HashSet<>();
        Collection<File> listFiles = FileUtils.listFiles(indexFile, new String[]{"md", "html"}, true);

        for (File paresFile : listFiles) {
            if (paresFile.getName().startsWith("_")) {
                continue;
            }
            try {
                Map<String, Object> escape = Parse.parse(paresFile);
                if (escape == null) {
                    logger.warn("parse null : {}", paresFile.getPath());
                    continue;
                }
                PublicClient.insert(escape, INDEX_PREFIX + "_" + escape.get("lang"));
                idSet.add((String) escape.get("path"));
            } catch (Exception e) {
                logger.error("failed to import {}", paresFile.getPath(), e);
            }
        }

        logger.info("start delete expired document");
        // NOTE(review): files that failed above are missing from idSet; presumably deleteExpired
        // removes every document whose id is not in the set - verify that dropping the previously
        // indexed version of a file that failed in this run is intended.
        PublicClient.deleteExpired(idSet, INDEX_PREFIX + "_*");
    }

}






147 changes: 147 additions & 0 deletions merlin/src/main/java/Parse.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;
import org.commonmark.node.Node;
import org.commonmark.parser.Parser;
import org.commonmark.renderer.html.HtmlRenderer;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.yaml.snakeyaml.Yaml;

/**
 * Turns a source file into the field map that is stored in the search index.
 *
 * <p>The path of a file relative to {@link #BASEPATH} is expected to look like
 * {@code <type>/<lang>/...}; files of type {@link #DOCS} additionally need a third directory
 * level, {@code docs/<lang>/<docsType>/...}.
 */
public class Parse {

    public static final String DOCS = "docs";

    public static final String BASEPATH = System.getenv("TARGET") + "/";

    /** Marker that opens and closes the YAML front matter of a document. */
    private static final String FRONT_MATTER_DELIMITER = "---";

    /** Matches every non-digit character; compiled once instead of once per date. */
    private static final Pattern NON_DIGIT = Pattern.compile("\\D");

    /**
     * Builds the index document for one file.
     *
     * <p>Every document gets {@code lang}, {@code type}, {@code articleName} and {@code path}.
     * Files of type {@link #DOCS} are additionally parsed by {@link #parseDocsType} and get a
     * {@code docsType} field.
     *
     * @param file the file to parse
     * @return the field map of the document
     * @throws Exception if the file cannot be read, its front matter is not valid YAML, or its
     *                   path does not have the directory levels described on this class
     */
    public static Map<String, Object> parse(File file) throws Exception {
        String fileName = file.getName();
        // Normalise separators first so that BASEPATH matches on every platform.
        String path = file.getPath()
                .replace("\\", "/")
                .replace(BASEPATH, "")
                .replace(".md", "")
                .replace(".html", "");

        String type = pathSegment(path, 0);
        String lang = pathSegment(path, type.length() + 1);

        Map<String, Object> jsonMap = new HashMap<>();
        jsonMap.put("lang", lang);
        jsonMap.put("type", type);
        jsonMap.put("articleName", fileName);
        jsonMap.put("path", path);

        String fileContent = FileUtils.readFileToString(file, StandardCharsets.UTF_8);

        if (type.equals(DOCS)) {
            parseDocsType(jsonMap, fileContent, fileName);

            // Offset just behind "<type>/<lang>/".
            int pelen = lang.length() + type.length() + 2;
            jsonMap.put("docsType", pathSegment(path, pelen));
        }

        return jsonMap;
    }

    /**
     * Fills {@code jsonMap} with the content fields of a docs file.
     *
     * <p>The text between the first two {@code ---} markers is read as YAML front matter and the
     * text after the second marker is rendered as Markdown. From the rendered HTML the method
     * stores {@code title} (first {@code h1}, or {@code fileName} when there is none),
     * {@code textContent}, {@code h1}..{@code h5} and {@code strong}. Front-matter entries are
     * copied with lower-cased keys, except that {@code head} is dropped, {@code title} is stored
     * as {@code specify}, a string {@code author} is wrapped into an array and {@code date} is
     * normalised; {@code archives} is the {@code yyyy-MM} prefix of the date.
     *
     * @param jsonMap     map to add the fields to
     * @param fileContent complete text of the file
     * @param fileName    file name used as title when the document has no {@code h1}
     */
    public static void parseDocsType(Map<String, Object> jsonMap, String fileContent, String fileName) {
        String frontMatter = "";
        String body = fileContent;

        // Only cut the document when both markers are present. A lone "---" (for example a
        // horizontal rule or a table separator) used to discard everything in front of it.
        // NOTE(review): the markers are matched anywhere in the file, not only at its start.
        int open = fileContent.indexOf(FRONT_MATTER_DELIMITER);
        if (open >= 0) {
            int afterOpen = open + FRONT_MATTER_DELIMITER.length();
            int close = fileContent.indexOf(FRONT_MATTER_DELIMITER, afterOpen);
            if (close >= 0) {
                frontMatter = fileContent.substring(afterOpen, close);
                body = fileContent.substring(close + FRONT_MATTER_DELIMITER.length());
            }
        }

        Parser parser = Parser.builder().build();
        HtmlRenderer renderer = HtmlRenderer.builder().build();
        Node document = parser.parse(body);
        Document node = Jsoup.parse(renderer.render(document));

        Elements h1 = node.getElementsByTag("h1");
        jsonMap.put("title", h1.isEmpty() ? fileName : h1.first().text());
        jsonMap.put("textContent", node.text());

        jsonMap.put("h1", h1.text());
        jsonMap.put("h2", node.getElementsByTag("h2").text());
        jsonMap.put("h3", node.getElementsByTag("h3").text());
        jsonMap.put("h4", node.getElementsByTag("h4").text());
        jsonMap.put("h5", node.getElementsByTag("h5").text());
        jsonMap.put("strong", node.getElementsByTag("strong").text());

        // NOTE(review): a default Yaml instance may construct arbitrary types from tagged input;
        // use a safe constructor if the imported files are not fully trusted.
        Object loaded = new Yaml().load(frontMatter);
        if (loaded instanceof Map) {
            // load() yields null for empty front matter and may yield a scalar or a list for
            // unusual content; only a mapping carries fields.
            copyFrontMatter((Map<?, ?>) loaded, jsonMap);
        }

        Object date = jsonMap.get("date");
        if (date != null) {
            String dateText = date.toString();
            if (dateText.length() >= 7) {
                jsonMap.put("archives", dateText.substring(0, 7));
            }
        }
    }

    /** Returns the part of {@code path} between {@code from} and the next {@code /}. */
    private static String pathSegment(String path, int from) {
        int end = path.indexOf('/', from);
        if (end < 0) {
            throw new IllegalArgumentException("unexpected path layout, missing directory level: " + path);
        }
        return path.substring(from, end);
    }

    /** Copies the front-matter entries into {@code jsonMap}, applying the per-key rules. */
    private static void copyFrontMatter(Map<?, ?> frontMatter, Map<String, Object> jsonMap) {
        for (Map.Entry<?, ?> entry : frontMatter.entrySet()) {
            if (entry.getKey() == null) {
                continue;
            }
            // YAML keys are not necessarily strings (e.g. numbers), so convert instead of casting.
            String key = String.valueOf(entry.getKey()).toLowerCase(Locale.ROOT);
            Object value = entry.getValue();

            if (key.equals("date")) {
                if (value == null) {
                    // An empty "date:" entry carries no information; leave the field out.
                    continue;
                }
                value = normalizeDate(value);
            }
            if (key.equals("author") && value instanceof String) {
                value = new String[]{value.toString()};
            }
            if (key.equals("head")) {
                continue;
            }
            if (key.equals("title")) {
                // Keep the title taken from the first heading; the front-matter title goes to "specify".
                key = "specify";
            }
            jsonMap.put(key, value);
        }
    }

    /**
     * Normalises a front-matter date: non-standard dates make the Elasticsearch insert fail.
     * Every non-digit becomes {@code -} and single-digit month/day values are zero-padded.
     */
    private static String normalizeDate(Object value) {
        String dateString;
        if (value instanceof java.util.Date) {
            // NOTE(review): formats in the JVM default time zone; confirm this matches the zone
            // in which the YAML parser created the Date.
            dateString = new SimpleDateFormat("yyyy-MM-dd").format((java.util.Date) value);
        } else {
            dateString = value.toString();
        }
        dateString = NON_DIGIT.matcher(dateString).replaceAll("-");

        // Padding only applies to the "yyyy-M-d" family (8 or 9 characters); shorter strings
        // are left untouched instead of failing on charAt(7).
        if (dateString.length() >= 8 && dateString.length() < 10) {
            StringBuilder padded = new StringBuilder(dateString);
            if (padded.charAt(7) != '-') {
                padded.insert(5, "0");
            }
            if (padded.length() < 10) {
                padded.insert(8, "0");
            }
            dateString = padded.toString();
        }
        return dateString;
    }

}
Loading

0 comments on commit 1be6ed4

Please sign in to comment.