diff --git a/metafacture-io/src/main/java/org/metafacture/io/HttpOpener.java b/metafacture-io/src/main/java/org/metafacture/io/HttpOpener.java index 307a6548..adf5de30 100644 --- a/metafacture-io/src/main/java/org/metafacture/io/HttpOpener.java +++ b/metafacture-io/src/main/java/org/metafacture/io/HttpOpener.java @@ -1,5 +1,5 @@ /* - * Copyright 2013, 2022 Deutsche Nationalbibliothek et al + * Copyright 2013, 2023 Deutsche Nationalbibliothek et al * * Licensed under the Apache License, Version 2.0 the "License"; * you may not use this file except in compliance with the License. @@ -32,10 +32,12 @@ import java.io.SequenceInputStream; import java.net.HttpURLConnection; import java.net.URL; +import java.net.URLDecoder; import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.regex.Pattern; +import java.util.zip.GZIPInputStream; /** * Opens an {@link HttpURLConnection} and passes a reader to the receiver. @@ -43,29 +45,39 @@ * @author Christoph Böhme * @author Jan Schnasse * @author Jens Wille + * @author Pascal Christoph (dr0i) */ -@Description("Opens an HTTP resource. Supports setting HTTP header fields `Accept`, `Accept-Charset` and `Content-Type`, as well as generic headers (separated by `\\n`). Defaults: request `method` = `GET`, request `url` = `@-` (input data), request `body` = `@-` (input data) if request method supports body and input data not already used, `Accept` header = `*/*`, `Accept-Charset` header (`encoding`) = `UTF-8`, `errorPrefix` = `ERROR: `.") +@Description("Opens an HTTP resource. Supports setting HTTP header fields `Accept`, `Accept-Charset`, `Accept-Encoding`, `Content-Encoding` and `Content-Type`, as well as generic headers (separated by `\\n`). 
Defaults: request `method` = `GET`, request `url` = `@-` (input data), request `body` = `@-` (input data) if request method supports body and input data not already used, `Accept` header (`accept`) = `*/*`, `Accept-Charset` header (`acceptcharset`) = `UTF-8`, `errorprefix` = `ERROR: `.") @In(String.class) @Out(Reader.class) @FluxCommand("open-http") public final class HttpOpener extends DefaultObjectPipe> { - public static final String ACCEPT_DEFAULT = "*/*"; public static final String ACCEPT_HEADER = "accept"; + public static final String ACCEPT_CHARSET_HEADER = "accept-charset"; + public static final String ACCEPT_ENCODING_HEADER = "accept-encoding"; + public static final String CONTENT_ENCODING_HEADER = "content-encoding"; public static final String CONTENT_TYPE_HEADER = "content-type"; + + public static final String ACCEPT_DEFAULT = "*/*"; + public static final String CHARSET_DEFAULT = "UTF-8"; public static final String DEFAULT_PREFIX = "ERROR: "; - public static final String ENCODING_DEFAULT = "UTF-8"; - public static final String ENCODING_HEADER = "accept-charset"; + public static final String HEADER_FIELD_SEPARATOR = "\n"; + public static final String HEADER_VALUE_SEPARATOR = ":"; public static final String INPUT_DESIGNATOR = "@-"; + public static final String MIME_PARAMETER_CHARSET = "charset"; + public static final String MIME_PARAMETER_SEPARATOR = ";"; + public static final String MIME_PARAMETER_VALUE_SEPARATOR = "="; public static final String DEFAULT_METHOD_NAME = "GET"; public static final Method DEFAULT_METHOD = Method.valueOf(DEFAULT_METHOD_NAME); - public static final String HEADER_FIELD_SEPARATOR = "\n"; - public static final String HEADER_VALUE_SEPARATOR = ":"; - private static final Pattern HEADER_FIELD_SEPARATOR_PATTERN = Pattern.compile(HEADER_FIELD_SEPARATOR); private static final Pattern HEADER_VALUE_SEPARATOR_PATTERN = Pattern.compile(HEADER_VALUE_SEPARATOR); + private static final Pattern MIME_PARAMETER_SEPARATOR_PATTERN = 
Pattern.compile(MIME_PARAMETER_SEPARATOR); + + private static final int ALLOWED_REDIRECTIONS = 3; + private static final int CONNECTION_TIMEOUT = 11000; private final Map headers = new HashMap<>(); @@ -118,7 +130,7 @@ public boolean getResponseHasBody() { */ public HttpOpener() { setAccept(ACCEPT_DEFAULT); - setEncoding(ENCODING_DEFAULT); + setAcceptCharset(CHARSET_DEFAULT); setErrorPrefix(DEFAULT_PREFIX); setMethod(DEFAULT_METHOD); setUrl(INPUT_DESIGNATOR); @@ -137,43 +149,59 @@ public void setAccept(final String accept) { } /** - * Sets the HTTP request body. The default value for the request body is - * {@value INPUT_DESIGNATOR} if the {@link #setMethod(Method) request - * method} accepts a request body, which means it will use the {@link - * #process(String) input data} data as request body if the input has - * not already been used; otherwise, no request body will be set by - * default. + * Sets the HTTP {@value CONTENT_TYPE_HEADER} header value. This is a + * MIME type such as {@code text/plain} or {@code application/json}. * - *

If a request body has been set, but the request method does not - * accept a body, the method may be changed to {@code POST}. + * @param contentType MIME type to use for the HTTP content-type header + */ + public void setContentType(final String contentType) { + setHeader(CONTENT_TYPE_HEADER, contentType); + } + + /** + * Sets the HTTP {@value ACCEPT_CHARSET_HEADER} header value. This is the + * preferred charset for the HTTP response. + * The default charset is {@value CHARSET_DEFAULT}. * - * @param body the request body + * @param charset name of the charset used for the accept-charset HTTP header */ - public void setBody(final String body) { - this.body = body; + public void setAcceptCharset(final String charset) { + setHeader(ACCEPT_CHARSET_HEADER, charset); } /** - * Sets the HTTP {@value CONTENT_TYPE_HEADER} header value. This is a - * MIME type such as {@code text/plain} or {@code application/json}. + * @deprecated Use {@link #setAcceptCharset} instead. + * @param charset name of the charset used for the accept-charset HTTP header + */ + @Deprecated + public void setEncoding(final String charset) { + setAcceptCharset(charset); + } + + /** + * Sets the HTTP {@value ACCEPT_ENCODING_HEADER} header value. This is the + * preferred content encoding for the HTTP response. It accepts HTTP compression. + * Allowed values include "gzip" and "br" (Brotli). + * The default for the content encoding is null, which means "no compression". * - * @param contentType MIME type to use for the HTTP content-type header + * @param acceptEncoding name of content encoding used for the accept-encoding HTTP + * header */ - public void setContentType(final String contentType) { - setHeader(CONTENT_TYPE_HEADER, contentType); + public void setAcceptEncoding(final String acceptEncoding) { + setHeader(ACCEPT_ENCODING_HEADER, acceptEncoding); } /** - * Sets the HTTP {@value ENCODING_HEADER} header value. This is the - * preferred encoding for the HTTP response. 
Additionally, the encoding - * is used for reading the HTTP response if it does not specify a content - * encoding. The default for the encoding is {@value ENCODING_DEFAULT}. + * Sets the HTTP {@value CONTENT_ENCODING_HEADER} header value. This is the + * content encoding for the HTTP request. It enables HTTP compression. + * The only allowed value is "gzip". + * The default for the content encoding is null, which means "no compression". * - * @param encoding name of the encoding used for the accept-charset HTTP + * @param contentEncoding name of content encoding used for the content-encoding HTTP * header */ - public void setEncoding(final String encoding) { - setHeader(ENCODING_HEADER, encoding); + public void setContentEncoding(final String contentEncoding) { + setHeader(CONTENT_ENCODING_HEADER, contentEncoding); } /** @@ -239,28 +267,40 @@ public void setUrl(final String url) { this.url = url; } + /** + * Sets the HTTP request body. The default value for the request body is + * {@value INPUT_DESIGNATOR} if the {@link #setMethod(Method) request + * method} accepts a request body, which means it will use the {@link + * #process(String) input data} as request body if the input has + * not already been used; otherwise, no request body will be set by + * default. + * + *

If a request body has been set, but the request method does not + * accept a body, the method may be changed to {@code POST}. + * + * @param body the request body + */ + public void setBody(final String body) { + this.body = body; + } + @Override public void process(final String input) { try { final String requestUrl = getInput(input, url); final String requestBody = getInput(input, - body == null && method.getRequestHasBody() ? INPUT_DESIGNATOR : body); - - final HttpURLConnection connection = - (HttpURLConnection) new URL(requestUrl).openConnection(); + body == null && method.getRequestHasBody() ? INPUT_DESIGNATOR : body); - connection.setRequestMethod(method.name()); - headers.forEach(connection::addRequestProperty); - - if (requestBody != null) { - connection.setDoOutput(true); - connection.getOutputStream().write(requestBody.getBytes()); - } + final URL urlToOpen = new URL(requestUrl); + final HttpURLConnection connection = requestBody != null ? + doOutput(urlToOpen, requestBody) : doRedirects(urlToOpen); final InputStream inputStream = getInputStream(connection); - final String contentEncoding = getEncoding(connection.getContentEncoding()); + final String charset = getContentCharset(connection); - getReceiver().process(new InputStreamReader(inputStream, contentEncoding)); + getReceiver().process(new InputStreamReader( + "gzip".equalsIgnoreCase(connection.getContentEncoding()) ? 
+ new GZIPInputStream(inputStream) : inputStream, charset)); } catch (final IOException e) { throw new MetafactureException(e); @@ -287,6 +327,46 @@ else if (inputUsed) { return result; } + private HttpURLConnection doOutput(final URL urlToOpen, final String requestBody) throws IOException { + final HttpURLConnection connection = openConnection(urlToOpen); + + connection.setDoOutput(true); + connection.getOutputStream().write(requestBody.getBytes()); + + return connection; + } + + private HttpURLConnection doRedirects(final URL startingUrl) throws IOException { + URL urlToFollow = startingUrl; + + for (int i = 0; i < ALLOWED_REDIRECTIONS; ++i) { + final HttpURLConnection connection = openConnection(urlToFollow); + connection.setInstanceFollowRedirects(false); // Make the logic below easier to detect redirections + + switch (connection.getResponseCode()) { + case HttpURLConnection.HTTP_MOVED_PERM: + case HttpURLConnection.HTTP_MOVED_TEMP: + final String location = URLDecoder.decode(connection.getHeaderField("Location"), "UTF-8"); + urlToFollow = new URL(urlToFollow, location); // Deal with relative URLs + break; + default: + return connection; + } + } + + throw new IOException("Too many redirects"); + } + + private HttpURLConnection openConnection(final URL urlToOpen) throws IOException { + final HttpURLConnection connection = (HttpURLConnection) urlToOpen.openConnection(); + + connection.setRequestMethod(method.name()); + connection.setConnectTimeout(CONNECTION_TIMEOUT); + headers.forEach(connection::setRequestProperty); + + return connection; + } + private InputStream getInputStream(final HttpURLConnection connection) throws IOException { try { return connection.getInputStream(); @@ -312,8 +392,23 @@ private InputStream getErrorStream(final InputStream errorStream) { } } - private String getEncoding(final String contentEncoding) { - return contentEncoding != null ? 
contentEncoding : headers.get(ENCODING_HEADER); + private String getContentCharset(final HttpURLConnection connection) { + final String contentType = connection.getContentType(); + + if (contentType != null) { + final String[] parts = MIME_PARAMETER_SEPARATOR_PATTERN.split(contentType); + + for (int i = 1; i < parts.length; ++i) { + final String parameter = parts[i].trim(); + final int index = parameter.indexOf(MIME_PARAMETER_VALUE_SEPARATOR); + + if (index != -1 && MIME_PARAMETER_CHARSET.equalsIgnoreCase(parameter.substring(0, index))) { + return parameter.substring(index + 1); + } + } + } + + return CHARSET_DEFAULT; } } diff --git a/metafacture-io/src/test/java/org/metafacture/io/HttpOpenerTest.java b/metafacture-io/src/test/java/org/metafacture/io/HttpOpenerTest.java index 3ef2974a..245ba4d1 100644 --- a/metafacture-io/src/test/java/org/metafacture/io/HttpOpenerTest.java +++ b/metafacture-io/src/test/java/org/metafacture/io/HttpOpenerTest.java @@ -23,6 +23,8 @@ import com.github.tomakehurst.wiremock.client.ResponseDefinitionBuilder; import com.github.tomakehurst.wiremock.client.WireMock; import com.github.tomakehurst.wiremock.core.WireMockConfiguration; +import com.github.tomakehurst.wiremock.http.HttpHeader; +import com.github.tomakehurst.wiremock.http.HttpHeaders; import com.github.tomakehurst.wiremock.http.RequestMethod; import com.github.tomakehurst.wiremock.junit.WireMockRule; import com.github.tomakehurst.wiremock.matching.RequestPatternBuilder; @@ -39,13 +41,15 @@ import org.mockito.junit.MockitoJUnit; import org.mockito.junit.MockitoRule; -import static org.mockito.Mockito.times; - +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.Reader; import java.util.Arrays; import java.util.function.BiConsumer; import java.util.function.Consumer; +import java.util.zip.GZIPOutputStream; + +import static org.mockito.Mockito.times; /** * Tests for class {@link HttpOpener}. 
@@ -63,6 +67,20 @@ public final class HttpOpenerTest { private static final String REQUEST_BODY = "request body"; private static final String RESPONSE_BODY = "response bödy"; // UTF-8 + private static byte[] GZIPPED_RESPONSE_BODY; + static { + try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { + final GZIPOutputStream gzip = new GZIPOutputStream(out); + gzip.write(RESPONSE_BODY.getBytes("UTF-8")); + gzip.close(); + + GZIPPED_RESPONSE_BODY = out.toByteArray(); + } + catch (final IOException e) { + e.printStackTrace(); + } + } + @Rule public MockitoRule mockitoRule = MockitoJUnit.rule(); @@ -226,38 +244,36 @@ public void shouldPerformPostRequestWithContentTypeParameter() throws IOExceptio } @Test - public void shouldPerformPostRequestWithEncodingParameter() throws IOException { - final String encoding = "ISO-8859-1"; + public void shouldPerformPostRequestWithCharsetParameter() throws IOException { + shouldPerformPostRequestWithCharsetParameter(null); + } + + @Test + public void shouldPerformPostRequestWithCharsetParameterAndContentTypeResponseHeader() throws IOException { + shouldPerformPostRequestWithCharsetParameter("expected: but was:"); + } + + private void shouldPerformPostRequestWithCharsetParameter(final String expectedMessage) throws IOException { + final String charset = "ISO-8859-1"; final String header = "Accept-Charset"; - final StringValuePattern value = WireMock.equalTo(encoding); + final StringValuePattern value = WireMock.equalTo(charset); + String actualMessage; try { shouldPerformRequest(REQUEST_BODY, HttpOpener.Method.POST, (o, u) -> { o.setMethod(HttpOpener.Method.POST); o.setUrl(u); - o.setEncoding(encoding); - }, s -> s.withHeader(header, value), q -> q.withHeader(header, value), null); + o.setAcceptCharset(charset); + }, s -> s.withHeader(header, value), q -> q.withHeader(header, value), expectedMessage != null ? 
+ r -> r.withHeader(HttpOpener.CONTENT_TYPE_HEADER, "text/plain; charset=" + charset) : null); + + actualMessage = null; } catch (final ComparisonFailure e) { - Assert.assertEquals("expected: but was:", e.getMessage()); + actualMessage = e.getMessage(); } - } - - @Test - public void shouldPerformPostRequestWithEncodingParameterAndContentEncodingResponseHeader() throws IOException { - final String encoding = "ISO-8859-1"; - final String header = "Accept-Charset"; - final StringValuePattern value = WireMock.equalTo(encoding); - shouldPerformRequest(REQUEST_BODY, HttpOpener.Method.POST, (o, u) -> { - o.setMethod(HttpOpener.Method.POST); - o.setUrl(u); - o.setEncoding(encoding); - }, - s -> s.withHeader(header, value), - q -> q.withHeader(header, value), - r -> r.withHeader("Content-Encoding", "UTF-8") - ); + Assert.assertEquals(expectedMessage, actualMessage); } @Test @@ -278,6 +294,12 @@ public void shouldPerformGetRequestWithErrorResponseAndWithoutErrorPrefixParamet null, null, WireMock.badRequest().withBody(RESPONSE_BODY), RESPONSE_BODY); } + @Test + public void shouldPerformGetRequestWithGzippedContentEncoding() throws IOException { + shouldPerformRequest(TEST_URL, HttpOpener.Method.GET, (o, u) -> o.setAcceptEncoding("gzip"), + null, null, WireMock.ok().withBody(GZIPPED_RESPONSE_BODY).withHeader(HttpOpener.CONTENT_ENCODING_HEADER, "gzip"), RESPONSE_BODY); + } + private void shouldPerformRequest(final String input, final HttpOpener.Method method, final BiConsumer consumer, final String... headers) throws IOException { shouldPerformRequest(input, method, consumer, s -> Arrays.stream(headers).forEach(h -> s.withHeader(h, TEST_VALUE)),