Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Progressive parsing with StreamParser #2096

Merged
merged 14 commits into from
Jan 5, 2024
10 changes: 10 additions & 0 deletions src/main/java/org/jsoup/Connection.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import org.jsoup.helper.RequestAuthenticator;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.jsoup.parser.StreamParser;
import org.jspecify.annotations.Nullable;

import javax.net.ssl.SSLSocketFactory;
Expand Down Expand Up @@ -883,6 +884,15 @@ <p>Other body methods (like bufferUp, body, parse, etc) will generally not work
@return the response body input stream
*/
BufferedInputStream bodyStream();

/**
 Returns a {@link StreamParser} that will parse the Response progressively.
 <p>This default implementation does not provide a stream parser and always throws
 {@link UnsupportedOperationException}; implementations that can parse progressively override it.</p>
 @return a StreamParser, prepared to parse this response.
 @throws IOException if an IO exception occurs preparing the parser.
 @throws UnsupportedOperationException if this implementation does not support stream parsing.
 */
default StreamParser streamParser() throws IOException {
    // carry a message so a caller hitting the default knows which operation is missing
    throw new UnsupportedOperationException("streamParser() is not supported by this Response implementation");
}
}

/**
Expand Down
245 changes: 163 additions & 82 deletions src/main/java/org/jsoup/helper/DataUtil.java

Large diffs are not rendered by default.

32 changes: 30 additions & 2 deletions src/main/java/org/jsoup/helper/HttpConnection.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,19 @@
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.jsoup.parser.StreamParser;
import org.jsoup.parser.TokenQueue;
import org.jspecify.annotations.Nullable;

import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLSocketFactory;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.net.CookieManager;
Expand Down Expand Up @@ -950,22 +953,47 @@ public String contentType() {
return contentType;
}

public Document parse() throws IOException {
/**
 Shared set-up for parse() and streamParser(): checks that the request has been executed, selects the input to
 read from, and flags the input as consumed.
 <p>When the body has already been buffered into {@code byteData}, a fresh stream over those bytes is used and
 the "already read" flag is reset first, so buffered responses may be parsed repeatedly.</p>
 @return the stream to parse from; never null
 */
private InputStream prepareParse() {
    Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before parsing response");

    final InputStream input;
    if (byteData == null) {
        // nothing buffered: read directly from the response body stream
        input = bodyStream;
    } else {
        // bytes have been read in to the buffer, so parse from those; re-parsing buffered bytes is allowed
        input = new ByteArrayInputStream(byteData.array());
        inputStreamRead = false;
    }

    Validate.isFalse(inputStreamRead, "Input stream already read and parsed, cannot re-read.");
    Validate.notNull(input);
    inputStreamRead = true; // a later parse attempt on an unbuffered stream will now be rejected
    return input;
}

/**
 Parses the response body into a Document, using the request's parser and this response's charset.
 <p>{@code safeClose()} is always invoked once parsing finishes, whether it succeeded or threw, so that a failed
 parse does not skip the clean-up step.</p>
 @return the parsed Document, with a connection attached
 @throws IOException if an IO exception occurs whilst reading or parsing the input
 */
@Override public Document parse() throws IOException {
    InputStream stream = prepareParse(); // validates state and marks the input stream as read
    try {
        Document doc = DataUtil.parseInputStream(stream, charset, url.toExternalForm(), req.parser());
        doc.connection(new HttpConnection(req, this)); // because we're static, don't have the connection obj. // todo - maybe hold in the req?
        charset = doc.outputSettings().charset().name(); // update charset from meta-equiv, possibly
        return doc;
    } finally {
        safeClose(); // run on both the success and the failure path
    }
}

@Override public StreamParser streamParser() throws IOException {
InputStream stream = prepareParse();
String baseUri = url.toExternalForm();
DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(stream, charset, baseUri, req.parser());
// note that there may be a document in CharsetDoc as a result of scanning meta-data -- but as requires a stream parse, it is not used here. todo - revisit.

// set up the stream parser and rig this connection up to the parsed doc:
StreamParser streamer = new StreamParser(req.parser());
BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charsetDoc.charset));
jhy marked this conversation as resolved.
Dismissed
Show resolved Hide resolved
DataUtil.maybeSkipBom(reader, charsetDoc);
streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
streamer.document().connection(new HttpConnection(req, this));
charset = charsetDoc.charset.name();

// we don't safeClose() as in parse(); caller must close streamParser to close InputStream stream
return streamer;
}

private void prepareByteData() {
Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before getting response body");
if (bodyStream != null && byteData == null) {
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/org/jsoup/internal/ControllableInputStream.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ public int read(byte[] b, int off, int len) throws IOException {
remaining -= read;
return read;
} catch (SocketTimeoutException e) {
if (expired())
throw e;
return 0;
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/parser/CharacterReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public final class CharacterReader {

public CharacterReader(Reader input, int sz) {
Validate.notNull(input);
Validate.isTrue(input.markSupported());
Validate.isTrue(input.markSupported(), "The supplied Reader must support mark(), but does not.");
reader = input;
charBuf = new char[Math.min(sz, maxBufferLen)];
bufferUp();
Expand Down
3 changes: 1 addition & 2 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,9 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) {
fragmentParsing = false;
}

@Override List<Node> parseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser) {
@Override List<Node> doParseFragment(@Nullable Element context) {
// context may be null
state = HtmlTreeBuilderState.Initial;
initialiseParse(new StringReader(inputFragment), baseUri, parser);
contextElement = context;
fragmentParsing = true;
Element root = null;
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/parser/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ public static List<Node> parseFragment(String fragmentHtml, Element context, Str
*/
public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
    XmlTreeBuilder treeBuilder = new XmlTreeBuilder();
    // single return using the four-argument parseFragment; the null second argument is presumably the
    // (nullable) context element, as in the HTML tree builder's signature -- confirm against TreeBuilder
    return treeBuilder.parseFragment(fragmentXml, null, baseUri, new Parser(treeBuilder));
}

/**
Expand Down
Loading