Skip to content

Commit

Permalink
Generate Pattypan import file for authors in public domain thanks to …
Browse files Browse the repository at this point in the history
…Wikidata
  • Loading branch information
don-vip committed Jul 9, 2022
1 parent 80ffc84 commit 89c0867
Show file tree
Hide file tree
Showing 12 changed files with 689 additions and 26 deletions.
37 changes: 31 additions & 6 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>com.github.donvip</groupId>
Expand All @@ -18,16 +17,34 @@
<sonar.host.url>https://sonarcloud.io</sonar.host.url>
</properties>

<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-bom</artifactId>
<version>4.0.3</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>

<dependencies>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.17.2</version>
<version>2.18.0</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.17.2</version>
<version>2.18.0</version>
</dependency>
<dependency>
<groupId>org.fusesource.jansi</groupId>
<artifactId>jansi</artifactId>
<version>2.4.0</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
Expand Down Expand Up @@ -59,11 +76,19 @@
<artifactId>uimaj-core</artifactId>
<version>2.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.2</version>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-repository-sparql</artifactId>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>5.8.2</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
</project>
41 changes: 33 additions & 8 deletions src/main/java/com/github/donvip/archscrap/ArchScrap.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import java.sql.SQLException;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

Expand All @@ -47,6 +48,9 @@
import com.github.donvip.archscrap.archives.toulouse.ToulouseArchScrap;
import com.github.donvip.archscrap.domain.Fonds;
import com.github.donvip.archscrap.domain.Notice;
import com.github.donvip.archscrap.uploadtools.Pattypan;
import com.github.donvip.archscrap.uploadtools.UploadTool;
import com.github.donvip.archscrap.wikidata.Author;

public abstract class ArchScrap implements AutoCloseable {

Expand Down Expand Up @@ -120,7 +124,7 @@ public final void close() throws IOException {
}

public static void usage() {
LOGGER.info("Usage: ArchScrap [paris|toulouse] scrap [<fonds>[,<fonds>]*] | check [<fonds>[,<fonds>]*] | download [<fonds>[,<fonds>]*] | wikicode [<fonds>]| gui");
LOGGER.info("Usage: ArchScrap [paris|toulouse] scrap [<fonds>[,<fonds>]*] | check [<fonds>[,<fonds>]*] | download [<fonds>[,<fonds>]*] | pattypan [<fonds>] | gui");
}

public static void main(String[] args) {
Expand All @@ -139,8 +143,8 @@ public static void main(String[] args) {
case "download":
app.doDownload(args);
break;
case "wikicode":
app.doWikicode(args);
case "pattypan":
app.doUploadTool(args, new Pattypan());
break;
case "gui":
app.launchGui();
Expand All @@ -154,6 +158,21 @@ public static void main(String[] args) {
LOGGER.info("Bye!");
}

/**
 * Writes the upload file of the given tool for every requested fonds.
 * <p>
 * When {@code args} has at most two elements, every fonds returned by
 * {@code fetchAllFonds()} is processed. Otherwise {@code args[2]} is read as a
 * comma-separated list of fonds cotes, each resolved through {@code searchFonds}.
 *
 * @param args command line arguments; the optional third element lists the cotes
 * @param tool upload tool invoked once per fonds
 * @throws IOException propagated from the fonds lookup or from the upload tool
 */
private void doUploadTool(String[] args, UploadTool tool) throws IOException {
    if (args.length <= 2) {
        // Process all fonds
        for (Fonds f : fetchAllFonds()) {
            tool.writeUploadFile(f, this);
        }
    } else {
        for (String cote : args[2].split(",")) {
            Fonds f = searchFonds(cote);
            if (f != null) {
                tool.writeUploadFile(f, this);
            } else {
                // checkFonds and downloadFonds also tolerate a missing fonds;
                // report it instead of handing null to the upload tool.
                LOGGER.warn("No fonds found for cote {}", cote);
            }
        }
    }
}

/**
 * Returns the institution name for this scraper; for instance
 * {@code ParisArchScrap} returns {@code "Archives de Paris"}.
 *
 * @return the institution name
 */
public abstract String getInstitution();

private static ArchScrap buildApp(String city) {
switch (city) {
case "paris": return new ParisArchScrap();
Expand Down Expand Up @@ -197,10 +216,6 @@ public final void doDownload(String[] args) throws IOException {
}
}

public final void doWikicode(String[] args) throws IOException {
// TODO download files
}

private void checkFonds(Fonds f) {
if (f != null) {
int expected = f.getExpectedNotices();
Expand All @@ -216,13 +231,17 @@ private void checkFonds(Fonds f) {

private void downloadFonds(Fonds f) throws IOException {
if (f != null) {
Path dir = Files.createDirectories(Paths.get("output", city, "fonds", f.getCote()));
Path dir = Files.createDirectories(getDownloadDir(f));
for (Notice n : f.getNotices()) {
downloadImage(n, dir);
}
}
}

/**
 * Computes the directory holding the downloaded files of a fonds:
 * {@code output/<city>/fonds/<cote>}, relative to the working directory.
 * The directory is not created by this method.
 *
 * @param f fonds whose cote names the last path element
 * @return the download directory path of the fonds
 */
public Path getDownloadDir(Fonds f) {
    final String cote = f.getCote();
    return Path.of("output", city, "fonds", cote);
}

private void downloadImage(Notice n, Path dir) throws IOException {
if (n.getFilename() == null || n.getDownloadUrl() == null) {
LOGGER.warn("No filename or download URL for {}", n);
Expand Down Expand Up @@ -355,4 +374,10 @@ protected final Document fetch(String doc) throws IOException {
LOGGER.info("Fetching {}{}", getBaseUrl(), doc);
return Jsoup.connect(getBaseUrl() + doc).get();
}

/**
 * Returns additional text fields for the given notice. For instance
 * {@code ParisArchScrap} renders the notice observation as a list of
 * "Information field" templates.
 *
 * @param n notice to describe
 * @return the additional fields as text
 */
public abstract String getOtherFields(Notice n);

/**
 * Returns the categories of the given notice.
 * NOTE(review): presumably category names consumed by the upload tool - confirm.
 *
 * @param n notice to categorize
 * @return the category names
 */
public abstract List<String> getCategories(Notice n);

/**
 * Returns the authors predefined by this scraper, indexed by a string key.
 * NOTE(review): in {@code ParisArchScrap} the keys look like author labels
 * as spelled by the archive - confirm against the caller.
 *
 * @return the predefined authors
 */
public abstract Map<String, Author> getPredefinedAuthors();
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import static java.util.stream.Collectors.toMap;

import java.io.IOException;
import java.net.URL;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
Expand All @@ -20,6 +22,7 @@
import com.github.donvip.archscrap.ArchScrap;
import com.github.donvip.archscrap.domain.Fonds;
import com.github.donvip.archscrap.domain.Notice;
import com.github.donvip.archscrap.wikidata.Author;

/**
* https://archives.paris.fr/a/234/catalogues-des-documents-figures/
Expand All @@ -32,10 +35,34 @@ public class ParisArchScrap extends ArchScrap {

private static final int PAGE = 20;

private static final Map<String, Author> PREDEFINED_AUTHORS = new HashMap<>();
static {
PREDEFINED_AUTHORS.put("Durandelle, Louis Emile (photographe)", new Author("Durandelle", "Louis", "photographe"));
PREDEFINED_AUTHORS.put("Barry (photographe)", new Author("Barry", "Jean", "photographe"));
PREDEFINED_AUTHORS.put("Petit, Pierre et fils (photographe)", new Author("Petit", "Pierre", "photographe"));
PREDEFINED_AUTHORS.put("S.I.P. (photographe)", new Author("Société industrielle de photographie", null, "studio photographique"));
PREDEFINED_AUTHORS.put("Vizzanova, F. (photographe)", new Author("Vizzavona", "François", "photographe"));
PREDEFINED_AUTHORS.put("Szepessy, V. de (photographe)", new Author("Szepessy", "Victor", "photographe"));
PREDEFINED_AUTHORS.put("Chevojon (studio photographique)", new Author("Studio Chevojon", null, "agence photographique"));
PREDEFINED_AUTHORS.put("Blanc, Geo (photographe)", new Author("Blanc", "Georges", "photographe"));
PREDEFINED_AUTHORS.put("Nobécourt, F. (photographe)", new Author("Nobécourt", "Fernand", "photographe"));
PREDEFINED_AUTHORS.put("Bernès, Marouteau et Cie (studio photographique)", new Author("Bernès, Marouteau & Cie", null, "agence photographique"));
PREDEFINED_AUTHORS.put("Cade, Paul (photographe)", new Author("Cadé", "Paul", "photographe"));
PREDEFINED_AUTHORS.put("Kollar (photographe)", new Author("Kollar", "François", "photographe"));
PREDEFINED_AUTHORS.put("Gerschel, Aaron (photographe)", new Author("Gerschel", "Aron", "photographe"));
PREDEFINED_AUTHORS.put("Delbo (photographe)", new Author("Bouillot", "Pierre", "photographe"));
PREDEFINED_AUTHORS.put("Harand, F. (photographe)", new Author("Harand", "François", "photographe"));
}

/**
 * Creates the Paris scraper, passing the city identifier {@code "paris"}
 * to the {@code ArchScrap} constructor.
 */
public ParisArchScrap() {
super("paris");
}

/**
 * Returns the institution name of this scraper.
 *
 * @return the constant {@code "Archives de Paris"}
 */
@Override
public String getInstitution() {
return "Archives de Paris";
}

@Override
protected Album getAlbum(String cote) {
return null;
Expand All @@ -58,15 +85,17 @@ protected List<Fonds> fetchAllFonds() throws IOException {

@Override
protected Notice searchNotice(Fonds f, int i, int j, boolean fetch) {
String[] tab = f.getExpectedNoticeCotes().get(i-1).split(";");
String[] tab = f.getExpectedNoticeCotes().get(i - 1).split(";");
String cote = tab[0];
Notice n = session.get(Notice.class, cote);
if (n == null && fetch) {
try {
Document desc = fetch(String.format("%s/f/", tab[1]));
String path = String.format("%s/f/", tab[1]);
Document desc = fetch(path);
if (desc != null) {
n = ParisParser.parseNotice(desc, cote);
if (n != null) {
n.setUrl(new URL(getBaseUrl() + path));
session.beginTransaction();
f.getNotices().add(n);
n.setFonds(f);
Expand All @@ -90,6 +119,7 @@ protected Notice searchNotice(Fonds f, int i, int j, boolean fetch) {
protected Fonds createNewFonds(String cote) throws IOException {
Fonds fonds = new Fonds(cote);
fonds.setTitle("Collections photographiques");
fonds.setTemplate("photograph");
Document doc = fetch("tableau/?&debut=0");
int n = extractNumberOfResults(doc);
fonds.setExpectedNotices(n);
Expand Down Expand Up @@ -128,15 +158,15 @@ private static List<String> extractCotes(Document doc) {

@Override
protected void postScrapFonds(Fonds f) throws IOException {
// Enrich notices with data unavailable in notices themselves but only through search... (sic)
// Enrich notices with data available only through search... (sic)
Document doc = fetch("tableau/?");
enrichNotices(f, doc, 16, (n, a) -> {
n.setAuthors(List.of(a));
persist(n);
});
enrichNotices(f, doc, 18, (n, a) -> {
String obs = n.getObservation();
n.setObservation((obs == null ? "" : obs + ';') + "Ouvrage="+a);
n.setObservation((obs == null ? "" : obs + ';') + "Ouvrage=" + a);
persist(n);
});
}
Expand All @@ -148,13 +178,12 @@ private static Map<Integer, String> extractMap(Document doc, int crit) {

private void enrichNotices(Fonds f, Document doc, int crit, BiConsumer<Notice, String> filler) throws IOException {
for (Entry<Integer, String> e : extractMap(doc, crit).entrySet()) {
BiConsumer<Document, String> parser = (d, v) ->
extractCotes(d).stream()
.map(s -> s.split(";")[0])
.map(cote -> f.getNotices().stream().filter(n -> n.getCote().equals(cote)).findFirst()
.orElseThrow(() -> new IllegalStateException("No notice found for cote " + cote)))
.forEach(n -> filler.accept(n, v));
Document results = fetch(String.format("tableau/?&crit1=%d&v_%d_1=%s&v_%d_2=%d", crit, crit, e.getValue(), crit, e.getKey()));
BiConsumer<Document, String> parser = (d, v) -> extractCotes(d).stream().map(s -> s.split(";")[0])
.map(cote -> f.getNotices().stream().filter(n -> n.getCote().equals(cote)).findFirst()
.orElseThrow(() -> new IllegalStateException("No notice found for cote " + cote)))
.forEach(n -> filler.accept(n, v));
Document results = fetch(String.format("tableau/?&crit1=%d&v_%d_1=%s&v_%d_2=%d", crit, crit, e.getValue(),
crit, e.getKey()));
String decodedValue = URLDecoder.decode(e.getValue(), StandardCharsets.ISO_8859_1);
int n = extractNumberOfResults(results);
parser.accept(results, decodedValue);
Expand All @@ -163,4 +192,98 @@ private void enrichNotices(Fonds f, Document doc, int crit, BiConsumer<Notice, S
}
}
}

/**
 * Renders the observation of a notice as "Information field" templates.
 * <p>
 * The observation is read as a {@code ;}-separated list of {@code key=value}
 * entries. Each entry produces one template, and templates are separated by
 * line feeds. Entries without any {@code =} (free text) cannot be mapped to a
 * named field and are skipped instead of failing.
 *
 * @param n notice whose observation is rendered
 * @return the rendered templates, or an empty string when there is nothing to render
 */
@Override
public String getOtherFields(Notice n) {
    StringBuilder sb = new StringBuilder();
    String observation = n.getObservation();
    if (observation != null) {
        for (String skv : observation.split(";")) {
            // A limit of 2 keeps '=' characters inside the value and
            // preserves an empty value ("key=" yields ["key", ""]).
            String[] kv = skv.split("=", 2);
            if (kv.length < 2) {
                continue;
            }
            if (sb.length() > 0) {
                sb.append('\n');
            }
            sb.append("{{Information field|name=").append(kv[0]).append("|value=").append(kv[1]).append("}}");
        }
    }
    return sb.toString();
}

/**
 * Computes the categories of a notice.
 * <p>
 * The first category is derived from the notice classification. Then every
 * {@code key=value} entry of the {@code ;}-separated observation is mapped:
 * {@code Arrondissement} and {@code Ouvrage} each give one category, while
 * {@code Quartier} gives none. The returned list never contains {@code null}.
 *
 * @param n notice to categorize
 * @return the category names, main category first
 * @throws IllegalArgumentException if the classification, an observation key,
 *         or an {@code Ouvrage} value is not a known one, or if an
 *         {@code Arrondissement}/{@code Ouvrage} entry has no value
 */
@Override
public List<String> getCategories(Notice n) {
    List<String> result = new ArrayList<>();
    result.add(switch (n.getClassification()) {
        case "Architecture" -> "Architecture - Archives de Paris";
        case "Expositions internationales" -> "Expositions internationales - Archives de Paris";
        case "Fortifications" -> "Fortifications - Archives de Paris";
        case "Métro parisien" -> "Chantier du métro parisien - Archives de Paris";
        case "Mobilier urbain" -> "Mobilier urbain - Archives de Paris";
        case "Rues UPF" -> "Photographies de rues par l’Union Photographique Française - Archives de Paris";
        default -> throw new IllegalArgumentException("Unexpected value: " + n.getClassification());
    });
    String observation = n.getObservation();
    if (observation != null) {
        for (String skv : observation.split(";")) {
            // A limit of 2 keeps '=' characters inside the value
            String[] kv = skv.split("=", 2);
            String value = kv.length > 1 ? kv[1] : null;
            String category = switch (kv[0]) {
                case "Arrondissement" -> String.format("Historical images of Paris %s arrondissement",
                        requireObservationValue(skv, value));
                case "Ouvrage" -> getOuvrageCategory(requireObservationValue(skv, value));
                // No category is derived from the quarter
                case "Quartier" -> null;
                default -> throw new IllegalArgumentException("Unexpected value: " + kv[0]);
            };
            if (category != null) {
                result.add(category);
            }
        }
    }
    return result;
}

/**
 * Returns the value of an observation entry, failing clearly when it is absent.
 */
private static String requireObservationValue(String entry, String value) {
    if (value == null) {
        throw new IllegalArgumentException("Missing value in observation entry: " + entry);
    }
    return value;
}

/**
 * Maps the value of an {@code Ouvrage} observation entry to its category.
 */
private static String getOuvrageCategory(String ouvrage) {
    return switch (ouvrage) {
        case "Lampadaire public" -> "Historical images of street lights in Paris";
        case "Toilettes publiques" -> "Historical images of public toilets in Paris";
        case "Locomotive à vapeur" -> "Historical images of trains in Paris";
        case "Kiosque" -> "Historical images of kiosks in Paris";
        case "Débit de boissons" -> "Historical images of cafés in Paris";
        case "Passerelle", "Pont" -> "Historical images of bridges in Paris";
        case "Immeuble" -> "Historical images of buildings in Paris";
        case "Immeuble à logements" -> "Historical images of residential buildings in Paris";
        case "Magasin de commerce", "Devanture de boutique" -> "Historical images of commerce buildings in Paris";
        case "Pavillon d'exposition" -> "Historical images of cultural buildings in Paris";
        case "Fontaine" -> "Fountains in Paris";
        case "Monument" -> "Monuments in Paris";
        case "Sculpture" -> "Sculptures in Paris";
        case "Statue" -> "Statues in Paris";
        case "Hôtel de Ville" -> "Historical images of Hôtel de Ville de Paris";
        default -> throw new IllegalArgumentException("Unexpected value: " + ouvrage);
    };
}

/**
 * Returns the predefined authors of this scraper. The returned map is the
 * static {@code PREDEFINED_AUTHORS} instance itself, not a copy.
 *
 * @return the shared map of predefined authors
 */
@Override
public Map<String, Author> getPredefinedAuthors() {
return PREDEFINED_AUTHORS;
}
}
Loading

0 comments on commit 89c0867

Please sign in to comment.