Skip to content

Commit

Permalink
WIP - initial support of Keepeek/GWT
Browse files Browse the repository at this point in the history
  • Loading branch information
don-vip committed Jan 22, 2024
1 parent 5273d5a commit ee6e051
Show file tree
Hide file tree
Showing 11 changed files with 410 additions and 17 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@
<dependency>
<groupId>org.apache.uima</groupId>
<artifactId>uimaj-core</artifactId>
<version>3.5.0</version>
<version>2.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
Expand Down
32 changes: 26 additions & 6 deletions src/main/java/com/github/donvip/glamscrap/GlamScrap.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpRequest.BodyPublishers;
import java.net.http.HttpResponse;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.file.Files;
Expand Down Expand Up @@ -48,6 +53,7 @@
import com.github.donvip.glamscrap.domain.Notice;
import com.github.donvip.glamscrap.institutions.paris.ParisArchivesGlamScrap;
import com.github.donvip.glamscrap.institutions.toulouse.ToulouseArchivesGlamScrap;
import com.github.donvip.glamscrap.institutions.toulouse.ToulousePhotothequeGlamScrap;
import com.github.donvip.glamscrap.uploadtools.Pattypan;
import com.github.donvip.glamscrap.uploadtools.UploadTool;
import com.github.donvip.glamscrap.wikidata.Author;
Expand Down Expand Up @@ -124,7 +130,7 @@ public final void close() throws IOException {
}

public static void usage() {
LOGGER.info("Usage: GlamScrap [paris|toulouse] scrap [<fonds>[,<fonds>]*] | check [<fonds>[,<fonds>]*] | download [<fonds>[,<fonds>]*] | pattypan [<fonds>] | gui");
LOGGER.info("Usage: GlamScrap [paris_archives|toulouse_archives|toulouse_photos] scrap [<fonds>[,<fonds>]*] | check [<fonds>[,<fonds>]*] | download [<fonds>[,<fonds>]*] | pattypan [<fonds>] | gui");
}

public static void main(String[] args) {
Expand Down Expand Up @@ -173,11 +179,12 @@ private void doUploadTool(String[] args, UploadTool tool) throws IOException {

public abstract String getInstitution();

private static GlamScrap buildApp(String city) {
switch (city) {
case "paris": return new ParisArchivesGlamScrap();
case "toulouse": return new ToulouseArchivesGlamScrap();
default: throw new IllegalArgumentException("Unsupported city: " + city);
private static GlamScrap buildApp(String institution) {
switch (institution) {
case "paris_archives": return new ParisArchivesGlamScrap();
case "toulouse_archives": return new ToulouseArchivesGlamScrap();
case "toulouse_photos": return new ToulousePhotothequeGlamScrap();
default: throw new IllegalArgumentException("Unsupported institution: " + institution);
}
}

Expand Down Expand Up @@ -375,6 +382,19 @@ protected final Document fetch(String doc) throws IOException {
return Jsoup.connect(getBaseUrl() + doc).get();
}

/**
 * Sends an HTTP POST to {@code getBaseUrl() + doc} and returns the response body as text.
 *
 * @param doc     path appended to the base URL
 * @param body    request body, sent as a string
 * @param headers optional header name/value pairs ({@code name1, value1, name2, value2, ...});
 *                may be omitted entirely
 * @return the response body
 * @throws IOException          if sending or receiving fails
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
protected final String fetchPost(String doc, String body, String...headers) throws IOException, InterruptedException {
    HttpRequest.Builder builder = HttpRequest.newBuilder()
            .method("POST", BodyPublishers.ofString(body))
            .uri(URI.create(getBaseUrl() + doc));
    // Builder.headers(String...) rejects an empty array with IllegalArgumentException,
    // so only forward the pairs when the caller actually supplied some.
    if (headers != null && headers.length > 0) {
        builder.headers(headers);
    }
    return fetchPost(builder.build());
}

/**
 * Executes the given request with a freshly created {@link HttpClient} and returns the
 * response body decoded as a string. The client is closed before this method returns.
 *
 * @param request the fully built request to send
 * @return the response body
 * @throws IOException          if sending or receiving fails
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
protected final String fetchPost(HttpRequest request) throws IOException, InterruptedException {
    try (HttpClient httpClient = HttpClient.newHttpClient()) {
        HttpResponse<String> reply = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
        return reply.body();
    }
}

public abstract String getOtherFields(Notice n);

public abstract List<String> getCategories(Notice n);
Expand Down
126 changes: 126 additions & 0 deletions src/main/java/com/github/donvip/glamscrap/Gwt.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
package com.github.donvip.glamscrap;

import static java.util.Objects.requireNonNull;

import java.net.URI;
import java.net.http.HttpRequest;
import java.net.http.HttpRequest.BodyPublishers;
import java.net.http.HttpRequest.Builder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;

import org.apache.commons.lang3.ArrayUtils;
import org.openjdk.nashorn.api.scripting.ScriptObjectMirror;

/**
 * Helpers to build GWT-RPC style request payloads (pipe-separated string table followed by
 * integer indices) and to decode "//OK" responses.
 */
public class Gwt {

    /** Type signature used for boolean values. */
    public static final String BOOLEAN = "java.lang.Boolean/476441737";
    /** Type signature used for integer values. */
    public static final String INTEGER = "java.lang.Integer/3438268394";
    /** Type signature used for string values. */
    public static final String STRING = "java.lang.String/2004016611";

    /** Script engine used to evaluate response bodies; {@code null} when Nashorn is not on the classpath. */
    private static final ScriptEngine NASHORN = new ScriptEngineManager().getEngineByName("nashorn");

    /**
     * One argument (or nested field) of a request.
     *
     * @param declaredType              type signature written in the method signature part, may be {@code null}
     * @param runtimeType               type signature written before the value, may be {@code null}
     * @param stringValue               value for string-typed entries, may be {@code null}
     * @param intValue                  value for integer/boolean-typed entries
     * @param fieldsInAlphabeticalOrder nested fields, serialized in list order; {@code null} is treated as empty
     */
    public record TypedValue(String declaredType, String runtimeType, String stringValue, int intValue, List<TypedValue> fieldsInAlphabeticalOrder) {
        public TypedValue {
            // A leaf value has no fields; accept null so callers need not pass an empty list.
            if (fieldsInAlphabeticalOrder == null) {
                fieldsInAlphabeticalOrder = List.of();
            }
        }
    }

    /**
     * Decoded response.
     *
     * @param protocolVersion last element of the response array
     * @param flags           second to last element of the response array
     * @param strings         string table (third to last element)
     * @param values          remaining elements, in reversed order
     */
    public record GwtResponse(int protocolVersion, int flags, List<String> strings, List<Object> values) {
    }

    /**
     * Builds a ready-to-send POST request.
     *
     * @see #requestBuilder(String, String, String, String, String, String, List)
     */
    public static HttpRequest request(String uri, String baseUrl, String strongNamePolicyFile, String permutation, String service, String method, List<TypedValue> argTypes) {
        return requestBuilder(uri, baseUrl, strongNamePolicyFile, permutation, service, method, argTypes).build();
    }

    /**
     * Creates a POST request builder targeting {@code uri}, with the GWT content type and
     * module/permutation headers set and the serialized payload as body.
     */
    public static Builder requestBuilder(String uri, String baseUrl, String strongNamePolicyFile, String permutation, String service, String method, List<TypedValue> argTypes) {
        return HttpRequest.newBuilder()
                .headers("Content-Type", "text/x-gwt-rpc; charset=utf-8", "X-GWT-Module-Base", baseUrl, "X-GWT-Permutation", permutation)
                .method("POST", BodyPublishers.ofString(requestPayload(baseUrl, strongNamePolicyFile, service, method, argTypes)))
                .uri(URI.create(uri));
    }

    /**
     * Serializes a call into the request payload: header, string table, then 1-based indices
     * into the string table (and raw integer values).
     *
     * @param baseUrl              module base URL (string table entry 1)
     * @param strongNamePolicyFile policy strong name (entry 2)
     * @param service              service interface name (entry 3)
     * @param method               method name (entry 4)
     * @param typedValues          method arguments
     * @return the payload, terminated by a {@code |}
     * @throws NullPointerException if one of the four leading strings is {@code null}
     */
    public static String requestPayload(String baseUrl, String strongNamePolicyFile, String service, String method, List<TypedValue> typedValues) {
        List<String> strings = new ArrayList<>();
        strings.add(requireNonNull(baseUrl));
        strings.add(requireNonNull(strongNamePolicyFile));
        strings.add(requireNonNull(service));
        strings.add(requireNonNull(method));
        for (TypedValue typedVal : typedValues) {
            addStrings(strings, typedVal);
        }

        List<Integer> ints = new ArrayList<>();
        ints.add(1); // baseUrl
        ints.add(2); // strongNamePolicyFile
        ints.add(3); // service
        ints.add(4); // method
        ints.add(typedValues.size()); // number of arguments
        for (TypedValue typedVal : typedValues) {
            // indexOf returns -1 for an unknown/null type, which yields index 0
            ints.add(strings.indexOf(typedVal.declaredType) + 1);
        }
        for (TypedValue typedVal : typedValues) {
            addValueIndice(strings, ints, typedVal);
        }

        return requestPayload(strings.toArray(new String[0]), ints.stream().mapToInt(Integer::intValue).toArray());
    }

    /** Registers, once each, the type names and string value of {@code typedVal} and of its nested fields. */
    private static void addStrings(List<String> strings, TypedValue typedVal) {
        if (typedVal.declaredType != null && !strings.contains(typedVal.declaredType)) {
            strings.add(typedVal.declaredType);
        }
        if (typedVal.runtimeType != null && !strings.contains(typedVal.runtimeType)) {
            strings.add(typedVal.runtimeType);
        }
        if (typedVal.stringValue != null && !strings.contains(typedVal.stringValue)) {
            strings.add(typedVal.stringValue);
        }
        for (TypedValue field : typedVal.fieldsInAlphabeticalOrder) {
            addStrings(strings, requireNonNull(field));
        }
    }

    /** Appends the runtime type index and the value (or the nested field values) of {@code typedVal}. */
    private static void addValueIndice(List<String> strings, List<Integer> ints, TypedValue typedVal) {
        if (typedVal.runtimeType != null) {
            ints.add(strings.indexOf(typedVal.runtimeType) + 1);
        }
        if (typedVal.fieldsInAlphabeticalOrder.isEmpty()) {
            if (typedVal.declaredType != null && typedVal.declaredType.startsWith("java.lang.String")) {
                // strings are referenced by table index, 0 meaning null
                ints.add(typedVal.stringValue != null ? strings.indexOf(typedVal.stringValue) + 1 : 0);
            } else if (typedVal.runtimeType != null && (typedVal.runtimeType.startsWith("java.lang.Integer") || typedVal.runtimeType.startsWith("java.lang.Boolean"))) {
                ints.add(typedVal.intValue);
            }
        }
        for (TypedValue field : typedVal.fieldsInAlphabeticalOrder) {
            addValueIndice(strings, ints, field);
        }
    }

    private static String requestPayload(String[] strings, int[] ints) {
        return requestPayload(7, 0, strings, ints);
    }

    /**
     * Joins header, escaped string table and indices with {@code |}. Built without
     * {@code String.format} so that digits never depend on the default locale.
     */
    private static String requestPayload(int protocolVersion, int flags, String[] strings, int[] ints) {
        return protocolVersion + "|" + flags + "|" + strings.length + "|"
                + Arrays.stream(strings).map(Gwt::escape).collect(Collectors.joining("|")) + "|"
                + Arrays.stream(ints).mapToObj(Integer::toString).collect(Collectors.joining("|")) + "|";
    }

    /**
     * Escapes the characters that are significant in the payload: the separator {@code |}
     * becomes {@code \!}, a backslash becomes {@code \\} and NUL becomes {@code \0}.
     */
    private static String escape(String s) {
        StringBuilder out = new StringBuilder(s.length());
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            switch (c) {
                case '\\' -> out.append("\\\\");
                case '|' -> out.append("\\!");
                case '\0' -> out.append("\\0");
                default -> out.append(c);
            }
        }
        return out.toString();
    }

    /**
     * Decodes a response starting with {@code //OK}: the remainder is evaluated as a
     * JavaScript array whose elements are then read in reverse order.
     *
     * @param response raw response body
     * @return the decoded response
     * @throws IllegalArgumentException if the response does not start with {@code //OK}
     * @throws IllegalStateException    if no "nashorn" script engine is available
     * @throws ScriptException          if the response body cannot be evaluated
     */
    public static GwtResponse decodeResponse(String response) throws ScriptException {
        if (!response.startsWith("//OK")) {
            throw new IllegalArgumentException("Invalid response: " + response);
        }
        if (NASHORN == null) {
            throw new IllegalStateException("Script engine 'nashorn' is not available");
        }
        // SECURITY NOTE(review): this evaluates text received from the remote server as
        // JavaScript. Only use against trusted endpoints, or replace with a real parser.
        // The engine result is read through the Map view (script object mirrors implement
        // javax.script.Bindings) rather than through the engine implementation class.
        java.util.Map<?, ?> res = (java.util.Map<?, ?>) NASHORN.eval(response.substring(4));
        List<Object> list = new ArrayList<>(res.values());
        Collections.reverse(list);
        return new GwtResponse(
                // numeric literals may come back as Integer or Double depending on the engine
                ((Number) list.get(0)).intValue(),
                ((Number) list.get(1)).intValue(),
                ((java.util.Map<?, ?>) list.get(2)).values().stream().map(x -> (String) x).toList(),
                list.subList(3, list.size()));
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ public class ParisArchivesGlamScrap extends GlamScrap {
}

public ParisArchivesGlamScrap() {
super("paris");
super("paris_archives");
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public class ToulouseArchivesGlamScrap extends GlamScrap {
}

public ToulouseArchivesGlamScrap() {
super("toulouse");
super("toulouse_archives");
}

@Override
Expand Down Expand Up @@ -105,7 +105,7 @@ protected Notice searchNotice(Fonds f, int i, int j, boolean fetch) {
try {
Document desc = fetch(String.format("Web_VoirLaNotice/34_01/%s/ILUMP21411", cote.replace("/", "xzx")));
if (desc != null) {
n = ToulouseParser.parseNotice(desc, cote);
n = ToulouseArchivesParser.parseNotice(desc, cote);
if (n != null) {
session.beginTransaction();
f.getNotices().add(n);
Expand All @@ -129,11 +129,7 @@ protected Notice searchNotice(Fonds f, int i, int j, boolean fetch) {
@Override
protected Fonds createNewFonds(String cote) throws IOException {
Document doc = fetch(String.format("Web_FondsCClass%s/ILUMP31929", cote));
if (doc != null) {
return ToulouseParser.parseFonds(doc, cote);
} else {
return null;
}
return doc != null ? ToulouseArchivesParser.parseFonds(doc, cote) : null;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import com.github.donvip.glamscrap.domain.Fonds;
import com.github.donvip.glamscrap.domain.Notice;

class ToulouseParser extends Parser {
class ToulouseArchivesParser extends Parser {

private static final Logger LOGGER = LogManager.getLogger();

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package com.github.donvip.glamscrap.institutions.toulouse;

import java.util.List;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import com.github.donvip.glamscrap.Gwt.GwtResponse;
import com.github.donvip.glamscrap.Parser;
import com.github.donvip.glamscrap.domain.Fonds;

/**
 * Parser turning decoded GWT responses into domain objects. Work in progress: the title
 * is not extracted yet and the expected number of notices is a fixed placeholder.
 */
public class ToulousePhotosParser extends Parser {

    private static final Logger LOGGER = LogManager.getLogger();

    /**
     * Builds a {@link Fonds} for the given reference.
     *
     * @param basket decoded basket response; currently only checked for presence
     * @param medias decoded media responses; currently only checked for presence
     * @param cote   reference used to create the fonds
     * @return the new fonds, or {@code null} when {@code basket} or {@code medias} is missing
     */
    public static Fonds parseFonds(GwtResponse basket, List<GwtResponse> medias, String cote) {
        if (basket == null || medias == null) {
            LOGGER.warn("Couldn't parse fonds for: {}", cote);
            return null;
        }
        final Fonds fonds = new Fonds(cote);
        // Step 0 - title lookup, not implemented yet. Earlier sketch kept for reference:
        //int idx = basket.indexOf("\"com.keepeek.kpk360.shared.transport.UserLightTransport/");
        //idx = basket.lastIndexOf("\"", idx - 3);
        //f.setTitle(basket.substring(idx + 1, basket.indexOf("\"", idx + 1)));
        // Step 1 - expected number of notices; hard-coded until it is read from the response
        try {
            fonds.setExpectedNotices(Integer.valueOf("617")); // TODO
        } catch (RuntimeException e) {
            LOGGER.warn("Unable to fetch number of notices for {}", cote);
        }
        return fonds;
    }
}
Loading

0 comments on commit ee6e051

Please sign in to comment.