diff --git a/pom.xml b/pom.xml index caaf993..8b480ed 100644 --- a/pom.xml +++ b/pom.xml @@ -74,7 +74,7 @@ org.apache.uima uimaj-core - 3.5.0 + 2.11.0 org.apache.poi diff --git a/src/main/java/com/github/donvip/glamscrap/GlamScrap.java b/src/main/java/com/github/donvip/glamscrap/GlamScrap.java index fec3a8f..b78f7da 100644 --- a/src/main/java/com/github/donvip/glamscrap/GlamScrap.java +++ b/src/main/java/com/github/donvip/glamscrap/GlamScrap.java @@ -19,6 +19,11 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpRequest.BodyPublishers; +import java.net.http.HttpResponse; import java.nio.channels.Channels; import java.nio.channels.ReadableByteChannel; import java.nio.file.Files; @@ -48,6 +53,7 @@ import com.github.donvip.glamscrap.domain.Notice; import com.github.donvip.glamscrap.institutions.paris.ParisArchivesGlamScrap; import com.github.donvip.glamscrap.institutions.toulouse.ToulouseArchivesGlamScrap; +import com.github.donvip.glamscrap.institutions.toulouse.ToulousePhotothequeGlamScrap; import com.github.donvip.glamscrap.uploadtools.Pattypan; import com.github.donvip.glamscrap.uploadtools.UploadTool; import com.github.donvip.glamscrap.wikidata.Author; @@ -124,7 +130,7 @@ public final void close() throws IOException { } public static void usage() { - LOGGER.info("Usage: GlamScrap [paris|toulouse] scrap [[,]*] | check [[,]*] | download [[,]*] | pattypan [] | gui"); + LOGGER.info("Usage: GlamScrap [paris_archives|toulouse_archives|toulouse_photos] scrap [[,]*] | check [[,]*] | download [[,]*] | pattypan [] | gui"); } public static void main(String[] args) { @@ -173,11 +179,12 @@ private void doUploadTool(String[] args, UploadTool tool) throws IOException { public abstract String getInstitution(); - private static GlamScrap buildApp(String city) { - switch (city) { - case "paris": return new ParisArchivesGlamScrap(); - case "toulouse": return new ToulouseArchivesGlamScrap(); - default: throw new IllegalArgumentException("Unsupported city: " + city); + private static GlamScrap buildApp(String institution) { + switch (institution) { + case "paris_archives": return new ParisArchivesGlamScrap(); + case "toulouse_archives": return new ToulouseArchivesGlamScrap(); + case "toulouse_photos": return new ToulousePhotothequeGlamScrap(); + default: throw new IllegalArgumentException("Unsupported institution: " + institution); } } @@ -375,6 +382,19 @@ protected final Document fetch(String doc) throws IOException { return Jsoup.connect(getBaseUrl() + doc).get(); } + protected final String fetchPost(String doc, String body, String...headers) throws IOException, InterruptedException { + return fetchPost(HttpRequest.newBuilder() + .headers(headers) + .method("POST", BodyPublishers.ofString(body)) + .uri(URI.create(getBaseUrl() + doc)).build()); + } + + protected final String fetchPost(HttpRequest request) throws IOException, InterruptedException { + try (HttpClient client = HttpClient.newHttpClient()) { + return client.send(request, HttpResponse.BodyHandlers.ofString()).body(); + } + } + public abstract String getOtherFields(Notice n); public abstract List getCategories(Notice n); diff --git a/src/main/java/com/github/donvip/glamscrap/Gwt.java b/src/main/java/com/github/donvip/glamscrap/Gwt.java new file mode 100644 index 0000000..63615d6 --- /dev/null +++ b/src/main/java/com/github/donvip/glamscrap/Gwt.java @@ -0,0 +1,126 @@ +package com.github.donvip.glamscrap; + +import static java.util.Objects.requireNonNull; + +import java.net.URI; +import java.net.http.HttpRequest; +import java.net.http.HttpRequest.BodyPublishers; +import java.net.http.HttpRequest.Builder; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +import javax.script.ScriptEngine; +import javax.script.ScriptEngineManager; +import javax.script.ScriptException; + +import org.apache.commons.lang3.ArrayUtils; +import org.openjdk.nashorn.api.scripting.ScriptObjectMirror; + +public class Gwt { + + public static final String BOOLEAN = "java.lang.Boolean/476441737"; + public static final String INTEGER = "java.lang.Integer/3438268394"; + public static final String STRING = "java.lang.String/2004016611"; + + private static final ScriptEngine NASHORN = new ScriptEngineManager().getEngineByName("nashorn"); + + public record TypedValue(String declaredType, String runtimeType, String stringValue, int intValue, List fieldsInAlphabeticalOrder) { + } + + public record GwtResponse(int protocolVersion, int flags, List strings, List values) { + } + + public static HttpRequest request(String uri, String baseUrl, String strongNamePolicyFile, String permutation, String service, String method, List argTypes) { + return requestBuilder(uri, baseUrl, strongNamePolicyFile, permutation, service, method, argTypes).build(); + } + + public static Builder requestBuilder(String uri, String baseUrl, String strongNamePolicyFile, String permutation, String service, String method, List argTypes) { + return HttpRequest.newBuilder() + .headers("Content-Type", "text/x-gwt-rpc; charset=utf-8", "X-GWT-Module-Base", baseUrl, "X-GWT-Permutation", permutation) + .method("POST", BodyPublishers.ofString(requestPayload(baseUrl, strongNamePolicyFile, service, method, argTypes))) + .uri(URI.create(uri)); + } + + public static String requestPayload(String baseUrl, String strongNamePolicyFile, String service, String method, List typedValues) { + List strings = new ArrayList<>(); + strings.add(requireNonNull(baseUrl)); + strings.add(requireNonNull(strongNamePolicyFile)); + strings.add(requireNonNull(service)); + strings.add(requireNonNull(method)); + for (TypedValue typedVal : typedValues) { + addStrings(strings, typedVal); + } + + List ints = new ArrayList<>(); + ints.add(1); // baseUrl + ints.add(2); // strongNamePolicyFile + ints.add(3); // service + ints.add(4); // method + ints.add(typedValues.size()); // number of arguments + for (TypedValue typedVal : typedValues) { + ints.add(strings.indexOf(typedVal.declaredType) + 1); + } + for (TypedValue typedVal : typedValues) { + addValueIndice(strings, ints, typedVal); + } + + return requestPayload(strings.toArray(new String[0]), ArrayUtils.toPrimitive(ints.toArray(new Integer[0]))); + } + + private static void addStrings(List strings, TypedValue typedVal) { + if (typedVal.declaredType != null && !strings.contains(typedVal.declaredType)) { + strings.add(typedVal.declaredType); + } + if (typedVal.runtimeType != null && !strings.contains(typedVal.runtimeType)) { + strings.add(typedVal.runtimeType); + } + if (typedVal.stringValue != null && !strings.contains(typedVal.stringValue)) { + strings.add(typedVal.stringValue); + } + for (TypedValue field : typedVal.fieldsInAlphabeticalOrder) { + addStrings(strings, requireNonNull(field)); + } + } + + private static void addValueIndice(List strings, List ints, TypedValue typedVal) { + if (typedVal.runtimeType != null ) { + ints.add(strings.indexOf(typedVal.runtimeType) + 1); + } + if (typedVal.fieldsInAlphabeticalOrder.isEmpty()) { + if (typedVal.declaredType != null && typedVal.declaredType.startsWith("java.lang.String")) { + ints.add(typedVal.stringValue != null ? strings.indexOf(typedVal.stringValue) + 1 : 0); + } else if (typedVal.runtimeType != null && (typedVal.runtimeType.startsWith("java.lang.Integer") || typedVal.runtimeType.startsWith("java.lang.Boolean"))) { + ints.add(typedVal.intValue); + } + } + for (TypedValue field : typedVal.fieldsInAlphabeticalOrder) { + addValueIndice(strings, ints, field); + } + } + + private static String requestPayload(String[] strings, int[] ints) { + return requestPayload(7, 0, strings, ints); + } + + private static String requestPayload(int protocolVersion, int flags, String[] strings, int[] ints) { + return String.format("%d|%d|%d|%s|%s|", protocolVersion, flags, strings.length, String.join("|", strings), + Arrays.stream(ints).mapToObj(Integer::toString).collect(Collectors.joining("|"))); + } + + public static GwtResponse decodeResponse(String response) throws ScriptException { + if (!response.startsWith("//OK")) { + throw new IllegalArgumentException("Invalid response: " + response); + } + ScriptObjectMirror res = (ScriptObjectMirror) NASHORN.eval(response.substring(4)); + List list = new ArrayList<>(res.values()); + Collections.reverse(list); + return new GwtResponse( + (Integer) list.get(0), + (Integer) list.get(1), + ((ScriptObjectMirror) list.get(2)).values().stream().map(x -> (String) x).toList(), + list.subList(3, list.size())); + } +} diff --git a/src/main/java/com/github/donvip/glamscrap/institutions/paris/ParisArchivesGlamScrap.java b/src/main/java/com/github/donvip/glamscrap/institutions/paris/ParisArchivesGlamScrap.java index 92479d6..4825f8a 100644 --- a/src/main/java/com/github/donvip/glamscrap/institutions/paris/ParisArchivesGlamScrap.java +++ b/src/main/java/com/github/donvip/glamscrap/institutions/paris/ParisArchivesGlamScrap.java @@ -55,7 +55,7 @@ public class ParisArchivesGlamScrap extends GlamScrap { } public ParisArchivesGlamScrap() { - super("paris"); + super("paris_archives"); } @Override diff --git a/src/main/java/com/github/donvip/glamscrap/institutions/toulouse/ToulouseArchivesGlamScrap.java b/src/main/java/com/github/donvip/glamscrap/institutions/toulouse/ToulouseArchivesGlamScrap.java index a0add52..afb038f 100644 --- a/src/main/java/com/github/donvip/glamscrap/institutions/toulouse/ToulouseArchivesGlamScrap.java +++ b/src/main/java/com/github/donvip/glamscrap/institutions/toulouse/ToulouseArchivesGlamScrap.java @@ -40,7 +40,7 @@ public class ToulouseArchivesGlamScrap extends GlamScrap { } public ToulouseArchivesGlamScrap() { - super("toulouse"); + super("toulouse_archives"); } @Override @@ -105,7 +105,7 @@ protected Notice searchNotice(Fonds f, int i, int j, boolean fetch) { try { Document desc = fetch(String.format("Web_VoirLaNotice/34_01/%s/ILUMP21411", cote.replace("/", "xzx"))); if (desc != null) { - n = ToulouseParser.parseNotice(desc, cote); + n = ToulouseArchivesParser.parseNotice(desc, cote); if (n != null) { session.beginTransaction(); f.getNotices().add(n); @@ -129,11 +129,7 @@ protected Notice searchNotice(Fonds f, int i, int j, boolean fetch) { @Override protected Fonds createNewFonds(String cote) throws IOException { Document doc = fetch(String.format("Web_FondsCClass%s/ILUMP31929", cote)); - if (doc != null) { - return ToulouseParser.parseFonds(doc, cote); - } else { - return null; - } + return doc != null ? ToulouseArchivesParser.parseFonds(doc, cote) : null; } @Override diff --git a/src/main/java/com/github/donvip/glamscrap/institutions/toulouse/ToulouseParser.java b/src/main/java/com/github/donvip/glamscrap/institutions/toulouse/ToulouseArchivesParser.java similarity index 99% rename from src/main/java/com/github/donvip/glamscrap/institutions/toulouse/ToulouseParser.java rename to src/main/java/com/github/donvip/glamscrap/institutions/toulouse/ToulouseArchivesParser.java index 57c2e74..931afda 100644 --- a/src/main/java/com/github/donvip/glamscrap/institutions/toulouse/ToulouseParser.java +++ b/src/main/java/com/github/donvip/glamscrap/institutions/toulouse/ToulouseArchivesParser.java @@ -21,7 +21,7 @@ import com.github.donvip.glamscrap.domain.Fonds; import com.github.donvip.glamscrap.domain.Notice; -class ToulouseParser extends Parser { +class ToulouseArchivesParser extends Parser { private static final Logger LOGGER = LogManager.getLogger(); diff --git a/src/main/java/com/github/donvip/glamscrap/institutions/toulouse/ToulousePhotosParser.java b/src/main/java/com/github/donvip/glamscrap/institutions/toulouse/ToulousePhotosParser.java new file mode 100644 index 0000000..61f719d --- /dev/null +++ b/src/main/java/com/github/donvip/glamscrap/institutions/toulouse/ToulousePhotosParser.java @@ -0,0 +1,35 @@ +package com.github.donvip.glamscrap.institutions.toulouse; + +import java.util.List; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import com.github.donvip.glamscrap.Gwt.GwtResponse; +import com.github.donvip.glamscrap.Parser; +import com.github.donvip.glamscrap.domain.Fonds; + +public class ToulousePhotosParser extends Parser { + + private static final Logger LOGGER = LogManager.getLogger(); + + public static Fonds parseFonds(GwtResponse basket, List medias, String cote) { + if (basket != null && medias != null) { + final Fonds f = new Fonds(cote); + // 0. Search for title + //int idx = basket.indexOf("\"com.keepeek.kpk360.shared.transport.UserLightTransport/"); + //idx = basket.lastIndexOf("\"", idx - 3); + //f.setTitle(basket.substring(idx + 1, basket.indexOf("\"", idx + 1))); + // 1. Search for expected number of notices (information always displayed) + try { + f.setExpectedNotices(Integer.valueOf("617")); // TODO + } catch (RuntimeException e) { + LOGGER.warn("Unable to fetch number of notices for {}", cote); + } + return f; + } else { + LOGGER.warn("Couldn't parse fonds for: {}", cote); + return null; + } + } +} diff --git a/src/main/java/com/github/donvip/glamscrap/institutions/toulouse/ToulousePhotothequeGlamScrap.java b/src/main/java/com/github/donvip/glamscrap/institutions/toulouse/ToulousePhotothequeGlamScrap.java new file mode 100644 index 0000000..a690d1a --- /dev/null +++ b/src/main/java/com/github/donvip/glamscrap/institutions/toulouse/ToulousePhotothequeGlamScrap.java @@ -0,0 +1,145 @@ +package com.github.donvip.glamscrap.institutions.toulouse; + +import java.io.IOException; +import java.net.http.HttpRequest; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import javax.script.ScriptException; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import com.github.donvip.glamscrap.GlamScrap; +import com.github.donvip.glamscrap.Gwt; +import com.github.donvip.glamscrap.Gwt.GwtResponse; +import com.github.donvip.glamscrap.Gwt.TypedValue; +import com.github.donvip.glamscrap.domain.Fonds; +import com.github.donvip.glamscrap.domain.Notice; +import com.github.donvip.glamscrap.wikidata.Author; + +// https://phototheque.toulouse.fr/api/doc/ +public class ToulousePhotothequeGlamScrap extends GlamScrap { + + private static final Logger LOGGER = LogManager.getLogger(); + + private static final String BASE_URL = "https://phototheque.toulouse.fr/"; + + private static final Map ALBUMS = new HashMap<>(); + static { + ALBUMS.put("bBxZc08Cb", new Album(1, false)); + } + + public ToulousePhotothequeGlamScrap() { + super("toulouse_photos"); + } + + @Override + protected Album getAlbum(String cote) { + return ALBUMS.get(cote); + } + + @Override + protected Range getAllowedGap(String cote) { + return null; + } + + @Override + protected String getBaseUrl() { + return BASE_URL; + } + + @Override + public String getInstitution() { + return "Ville de Toulouse"; + } + + @Override + protected List fetchAllFonds() throws IOException { + return List.of(); + } + + @Override + protected void postScrapFonds(Fonds f) throws IOException { + // Do nothing + } + + @Override + protected Notice searchNotice(Fonds f, int i, int j, boolean fetch) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Fonds createNewFonds(String cote) throws IOException { + try { + GwtResponse basket = Gwt.decodeResponse(fetchPost(shareGetBasketAction(cote))); + LOGGER.info(basket); + List medias = new ArrayList<>(); + boolean finished = false; + int index = 0; + while (!finished) { + GwtResponse media = Gwt.decodeResponse(fetchPost(shareGetBasketMediasListAction(cote, index))); + LOGGER.info(media); + medias.add(media); + index += 60; // FIXME + finished = index > 617;//FIXME + } + return ToulousePhotosParser.parseFonds(basket, medias, cote); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new IOException(e); + } catch (ScriptException e) { + throw new IOException(e); + } + } + + protected static HttpRequest shareGetBasketAction(String sharedBasketIdentifier) { + return keepeekAction("ShareGetBasketAction", "com.keepeek.kpk360.shared.dispatch.share.basket.ShareGetBasketAction/3056539714", List.of( + new TypedValue(Gwt.STRING, null, null, -1, List.of()), + shareParameters(sharedBasketIdentifier))); + } + + protected static HttpRequest shareGetBasketMediasListAction(String sharedBasketIdentifier, int index) { + return keepeekAction("ShareGetBasketMediasListAction", "com.keepeek.kpk360.shared.dispatch.share.basket.ShareGetBasketMediasListAction/840037237", List.of( + new TypedValue(null, Gwt.INTEGER, null, -1, List.of()), + new TypedValue(null, Gwt.INTEGER, null, index, List.of()), + new TypedValue(null, Gwt.BOOLEAN, null, 0, List.of()), + new TypedValue(Gwt.STRING, null, "DESC", -1, List.of()), + new TypedValue(Gwt.STRING, null, "", -1, List.of()), + new TypedValue(Gwt.STRING, null, null, -1, List.of()), + shareParameters(sharedBasketIdentifier))); + } + + protected static HttpRequest keepeekAction(String urlAction, String actionQualifiedClass, List arguments) { + return Gwt.request(BASE_URL + "dispatch/" + urlAction, + "https://phototheque.toulouse.fr/keepeek360/", "2ED7D09E9D2EBA03EA03E51C1582B244", "AE4175F0BB42AE91733577263A0417ED", + "com.gwtplatform.dispatch.rpc.shared.DispatchService", "execute", List.of( + new TypedValue(Gwt.STRING, null, null, -1, List.of()), + new TypedValue("com.gwtplatform.dispatch.rpc.shared.Action", actionQualifiedClass, null, -1, arguments))); + } + + private static TypedValue shareParameters(String sharedBasketIdentifier) { + return new TypedValue(null, "com.keepeek.kpk360.shared.dispatch.common.share.ShareParameters/511658636", null, -1, List.of( + new TypedValue(Gwt.STRING, null, "fr", -1, List.of()), + new TypedValue(Gwt.STRING, null, sharedBasketIdentifier, -1, List.of()), + new TypedValue(Gwt.STRING, null, "https://phototheque.toulouse.fr/", -1, List.of()))); + } + + @Override + public String getOtherFields(Notice n) { + return ""; + } + + @Override + public List getCategories(Notice n) { + return List.of(); + } + + @Override + public Map getPredefinedAuthors() { + return Map.of(); + } +} diff --git a/src/main/resources/config.windows.props b/src/main/resources/config.windows.props index 4e03ecf..79d6cd4 100644 --- a/src/main/resources/config.windows.props +++ b/src/main/resources/config.windows.props @@ -21,7 +21,7 @@ considerTemponym = false # Path to TreeTagger home directory ################################### # Ensure there is no white space in path (try to escape white spaces) -treeTaggerHome = C:\\GIT\\GlamScrap\\TreeTagger\\windows +treeTaggerHome = C:\\git\\GlamScrap\\TreeTagger\\windows # This one is only necessary if you want to process chinese documents. chineseTokenizerPath = SET ME IN CONFIG.PROPS! (e.g., /home/jannik/treetagger/chinese-tokenizer) diff --git a/src/test/java/com/github/donvip/glamscrap/HttpRequestBodyTestUtility.java b/src/test/java/com/github/donvip/glamscrap/HttpRequestBodyTestUtility.java new file mode 100644 index 0000000..8b3a82d --- /dev/null +++ b/src/test/java/com/github/donvip/glamscrap/HttpRequestBodyTestUtility.java @@ -0,0 +1,49 @@ +package com.github.donvip.glamscrap; + +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.concurrent.Flow.Subscriber; +import java.util.concurrent.Flow.Subscription; + +public class HttpRequestBodyTestUtility { + + public static String extractBody(HttpRequest httpRequest) { + return httpRequest.bodyPublisher().map(p -> { + var bodySubscriber = HttpResponse.BodySubscribers.ofString(StandardCharsets.UTF_8); + var flowSubscriber = new HttpRequestBodyTestUtility.StringSubscriber(bodySubscriber); + p.subscribe(flowSubscriber); + return bodySubscriber.getBody().toCompletableFuture().join(); + }).orElseThrow(); + } + + static final class StringSubscriber implements Subscriber { + final HttpResponse.BodySubscriber wrapped; + + StringSubscriber(HttpResponse.BodySubscriber wrapped) { + this.wrapped = wrapped; + } + + @Override + public void onSubscribe(Subscription subscription) { + wrapped.onSubscribe(subscription); + } + + @Override + public void onNext(ByteBuffer item) { + wrapped.onNext(List.of(item)); + } + + @Override + public void onError(Throwable throwable) { + wrapped.onError(throwable); + } + + @Override + public void onComplete() { + wrapped.onComplete(); + } + } +} diff --git a/src/test/java/com/github/donvip/glamscrap/institutions/toulouse/ToulousePhotothequeGlamScrapTest.java b/src/test/java/com/github/donvip/glamscrap/institutions/toulouse/ToulousePhotothequeGlamScrapTest.java new file mode 100644 index 0000000..15b28f7 --- /dev/null +++ b/src/test/java/com/github/donvip/glamscrap/institutions/toulouse/ToulousePhotothequeGlamScrapTest.java @@ -0,0 +1,22 @@ +package com.github.donvip.glamscrap.institutions.toulouse; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +import com.github.donvip.glamscrap.HttpRequestBodyTestUtility; + +class ToulousePhotothequeGlamScrapTest { + + @Test + void testShareGetBasketAction() { + assertEquals("7|0|11|https://phototheque.toulouse.fr/keepeek360/|2ED7D09E9D2EBA03EA03E51C1582B244|com.gwtplatform.dispatch.rpc.shared.DispatchService|execute|java.lang.String/2004016611|com.gwtplatform.dispatch.rpc.shared.Action|com.keepeek.kpk360.shared.dispatch.share.basket.ShareGetBasketAction/3056539714|com.keepeek.kpk360.shared.dispatch.common.share.ShareParameters/511658636|fr|bBxZc08Cb|https://phototheque.toulouse.fr/|1|2|3|4|2|5|6|0|7|0|8|9|10|11|", + HttpRequestBodyTestUtility.extractBody(ToulousePhotothequeGlamScrap.shareGetBasketAction("bBxZc08Cb"))); + } + + @Test + void testShareGetBasketMediasListAction() { + assertEquals("7|0|15|https://phototheque.toulouse.fr/keepeek360/|2ED7D09E9D2EBA03EA03E51C1582B244|com.gwtplatform.dispatch.rpc.shared.DispatchService|execute|java.lang.String/2004016611|com.gwtplatform.dispatch.rpc.shared.Action|com.keepeek.kpk360.shared.dispatch.share.basket.ShareGetBasketMediasListAction/840037237|java.lang.Integer/3438268394|java.lang.Boolean/476441737|DESC||com.keepeek.kpk360.shared.dispatch.common.share.ShareParameters/511658636|fr|bBxZc08Cb|https://phototheque.toulouse.fr/|1|2|3|4|2|5|6|0|7|8|-1|8|60|9|0|10|11|0|12|13|14|15|", + HttpRequestBodyTestUtility.extractBody(ToulousePhotothequeGlamScrap.shareGetBasketMediasListAction("bBxZc08Cb", 60))); + } +}