Skip to content

Commit

Permalink
Merge pull request #349 from metafacture/extract-element
Browse files Browse the repository at this point in the history
Replace unreleased ScriptExtractor with generic ElementExtractor
  • Loading branch information
fsteeg authored Dec 9, 2020
2 parents 4c2eb8d + 99bc941 commit b7e78ea
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,21 +30,30 @@
import org.metafacture.framework.helpers.DefaultObjectPipe;

/**
* Extracts the first script from an HTML document
* Extracts the specified element from an HTML document
*
* @author Fabian Steeg
*/
@Description("Extracts the first script from an HTML document")
@Description("Extracts the specified element from an HTML document")
@In(Reader.class)
@Out(String.class)
@FluxCommand("extract-script")
public class ScriptExtractor extends DefaultObjectPipe<Reader, ObjectReceiver<String>> {
@FluxCommand("extract-element")
public class ElementExtractor extends DefaultObjectPipe<Reader, ObjectReceiver<String>> {
private String selector;

/**
 * Creates an extractor for the first element matching the given selector.
 *
 * @param selector The CSS-style jsoup selector, see
 *                 https://jsoup.org/cookbook/extracting-data/selector-syntax
 * @throws IllegalArgumentException if {@code selector} is null or empty
 */
public ElementExtractor(final String selector) {
    // Fail at construction time instead of later, when process() hands the
    // selector to document.select(...), so a misconfiguration is reported
    // where it originates.
    if (selector == null || selector.isEmpty()) {
        throw new IllegalArgumentException("selector must not be null or empty");
    }
    this.selector = selector;
}

@Override
public void process(final Reader reader) {
try {
Document document = Jsoup.parse(IOUtils.toString(reader));
Element firstScript = document.select("script").first();
getReceiver().process(firstScript.data());
Element firstElement = document.select(selector).first();
getReceiver().process(firstElement.data());
} catch (IOException e) {
e.printStackTrace();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@
# limitations under the License.
#
decode-html org.metafacture.html.HtmlDecoder
extract-script org.metafacture.html.ScriptExtractor
extract-element org.metafacture.html.ElementExtractor
Original file line number Diff line number Diff line change
Expand Up @@ -28,37 +28,40 @@
import org.mockito.MockitoAnnotations;

/**
* Tests for {@link ScriptExtractor}.
* Tests for {@link ElementExtractor}.
*
* @author Fabian Steeg
*
*/
public final class ScriptExtractorTest {
public final class ElementExtractorTest {

private static final StringReader IN = new StringReader("<html><script>{\"code\":\"yo\"}");
private static final StringReader IN = new StringReader("<html>"
+ "<script data-test='site-head-data'>{\"code\":\"hey\"}</script>"
+ "<script data-test='model-linked-data'>{\"code\":\"yo\"}");

private static final String OUT = "{\"code\":\"yo\"}";

private ScriptExtractor scriptExtractor;
private ElementExtractor elementExtractor;

@Mock
private ObjectReceiver<String> receiver;

@Before
public void setup() {
MockitoAnnotations.initMocks(this);
scriptExtractor = new ScriptExtractor();
scriptExtractor.setReceiver(receiver);
elementExtractor = new ElementExtractor("script[data-test=model-linked-data]");
elementExtractor.setReceiver(receiver);
}

@Test
public void testShouldProcessRecordsFollowedbySeparator() {
scriptExtractor.process(IN);
elementExtractor.process(IN);
verify(receiver).process(OUT);
verifyNoMoreInteractions(receiver);
}

@After
public void cleanup() {
scriptExtractor.closeStream();
elementExtractor.closeStream();
}
}

0 comments on commit b7e78ea

Please sign in to comment.