From cda8467f9568ff5f532d155a37868c7ed0435654 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 24 Nov 2022 00:49:43 +0800 Subject: [PATCH 01/15] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 68bf76d9c..cdf618211 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 997eb812c..7fe2ba6ff 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index e2c0f741c..c6b70bce1 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.8.0 + 0.8.1-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 05d6100a6..daf0c7fdc 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 449fcf243..e015567c2 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index b73f6fd27..732c23bd0 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 3ec15f9af..d1225dda2 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ 
webmagic-parent us.codecraft - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 715d7731b..a430772b6 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 From faf7e1559aa98a3bae6421fab1396257324e7273 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 24 Nov 2022 20:31:43 +0800 Subject: [PATCH 02/15] Update README for the webmagic version. --- README-zh.md | 4 ++-- README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README-zh.md b/README-zh.md index 62b3c9a5e..c3c4b72ea 100644 --- a/README-zh.md +++ b/README-zh.md @@ -39,12 +39,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.7.5 + ${webmagic.version} us.codecraft webmagic-extension - 0.7.5 + ${webmagic.version} ``` diff --git a/README.md b/README.md index 14aeac7b1..750a76841 100644 --- a/README.md +++ b/README.md @@ -25,12 +25,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.7.5 + ${webmagic.version} us.codecraft webmagic-extension - 0.7.5 + ${webmagic.version} ``` From ef616c999e18bb9a7a351049749b3796d3abb977 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 27 Nov 2022 02:05:31 +0800 Subject: [PATCH 03/15] Fix warnings. 
--- .../webmagic/monitor/SpiderMonitor.java | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index b213dda94..50dbcaf1a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -1,21 +1,25 @@ package us.codecraft.webmagic.monitor; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.SpiderListener; -import us.codecraft.webmagic.utils.Experimental; -import us.codecraft.webmagic.utils.UrlUtils; - -import javax.management.*; import java.lang.management.ManagementFactory; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; +import javax.management.InstanceAlreadyExistsException; +import javax.management.JMException; +import javax.management.MBeanRegistrationException; +import javax.management.MBeanServer; +import javax.management.MalformedObjectNameException; +import javax.management.NotCompliantMBeanException; +import javax.management.ObjectName; + +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.SpiderListener; +import us.codecraft.webmagic.utils.Experimental; +import us.codecraft.webmagic.utils.UrlUtils; + /** * @author code4crafer@gmail.com * @since 0.5.0 @@ -23,17 +27,13 @@ @Experimental public class SpiderMonitor { - private static SpiderMonitor INSTANCE = new SpiderMonitor(); - - private AtomicBoolean started = new AtomicBoolean(false); - - private Logger logger = LoggerFactory.getLogger(getClass()); + private static final SpiderMonitor 
INSTANCE = new SpiderMonitor(); private MBeanServer mbeanServer; private String jmxServerName; - private List spiderStatuses = new ArrayList(); + private List spiderStatuses = new ArrayList<>(); protected SpiderMonitor() { jmxServerName = "WebMagic"; @@ -51,7 +51,7 @@ public synchronized SpiderMonitor register(Spider... spiders) throws JMException for (Spider spider : spiders) { MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener(); if (spider.getSpiderListeners() == null) { - List spiderListeners = new ArrayList(); + List spiderListeners = new ArrayList<>(); spiderListeners.add(monitorSpiderListener); spider.setSpiderListeners(spiderListeners); } else { @@ -90,7 +90,7 @@ public void onSuccess(Request request) { } @Override - public void onError(Request request) { + public void onError(Request request, Exception e) { errorUrls.add(request.getUrl()); errorCount.incrementAndGet(); } @@ -109,7 +109,6 @@ public List getErrorUrls() { } protected void registerMBean(SpiderStatusMXBean spiderStatus) throws MalformedObjectNameException, InstanceAlreadyExistsException, MBeanRegistrationException, NotCompliantMBeanException { -// ObjectName objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName()); ObjectName objName = new ObjectName(jmxServerName + ":name=" + UrlUtils.removePort(spiderStatus.getName())); mbeanServer.registerMBean(spiderStatus, objName); } From 80424b0bd7242ae3f92055baabcedbf6e4a5913b Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Mon, 5 Dec 2022 23:26:01 +0800 Subject: [PATCH 04/15] Replace List with Iterable, fixed #1099. 
--- .../src/main/java/us/codecraft/webmagic/Page.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index c11df693c..6370171df 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -20,7 +20,7 @@ * {@link #getHtml()} get content of current page
* {@link #putField(String, Object)} save extracted result
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
- * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch
+ * {@link #addTargetRequests(Iterable)} {@link #addTargetRequest(String)} add urls to fetch
* * @author code4crafter@gmail.com
* @see us.codecraft.webmagic.downloader.Downloader @@ -52,7 +52,7 @@ public class Page { private List targetRequests = new ArrayList(); private String charset; - + public Page() { } @@ -108,7 +108,8 @@ public Json getJson() { * @deprecated since 0.4.0 * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. */ - public void setHtml(Html html) { + @Deprecated + public void setHtml(Html html) { this.html = html; } @@ -121,7 +122,7 @@ public List getTargetRequests() { * * @param requests requests */ - public void addTargetRequests(List requests) { + public void addTargetRequests(Iterable requests) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { continue; @@ -137,7 +138,7 @@ public void addTargetRequests(List requests) { * @param requests requests * @param priority priority */ - public void addTargetRequests(List requests, long priority) { + public void addTargetRequests(Iterable requests, long priority) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { continue; From a266df406ff4641d751c0607d203930fd0e7d7a5 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Tue, 20 Dec 2022 23:41:31 +0800 Subject: [PATCH 05/15] Add Site.defaultCharset. closes #1101. 
--- .../main/java/us/codecraft/webmagic/Site.java | 26 +++++++++++++++++++ .../downloader/HttpClientDownloader.java | 9 ++++--- .../java/us/codecraft/webmagic/SiteTest.java | 17 ++++++++++++ 3 files changed, 48 insertions(+), 4 deletions(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 4879b2825..230337756 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -28,6 +28,8 @@ public class Site { private String charset; + private String defaultCharset; + private int sleepTime = 5000; private int retryTimes = 0; @@ -168,6 +170,30 @@ public String getCharset() { return charset; } + /** + * Set default charset of page. + * + * When charset detection fails, this default charset is used. + * + * @param defaultCharset the default charset + * @return this + * @since 0.9.0 + */ + public Site setDefaultCharset(String defaultCharset) { + this.defaultCharset = defaultCharset; + return this; + } + + /** + * The default charset used if charset detection fails. 
+ * + * @return the default charset + * @since 0.9.0 + */ + public String getDefaultCharset() { + return defaultCharset; + } + public int getTimeOut() { return timeOut; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 72821f3c1..bfd24f01c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -4,6 +4,7 @@ import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; @@ -116,7 +117,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http page.setBytes(bytes); if (!request.isBinaryContent()) { if (charset == null) { - charset = getHtmlCharset(contentType, bytes); + charset = getHtmlCharset(contentType, bytes, task); } page.setCharset(charset); page.setRawText(new String(bytes, charset)); @@ -131,11 +132,11 @@ protected Page handleResponse(Request request, String charset, HttpResponse http return page; } - private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException { + private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException { String charset = CharsetUtils.detectCharset(contentType, contentBytes); if (charset == null) { - charset = Charset.defaultCharset().name(); - logger.warn("Charset autodetect failed, use {} as charset. 
Please specify charset in Site.setCharset()", Charset.defaultCharset()); + charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name); + logger.info("Charset autodetect failed, use {} as charset.", task.getSite().getDefaultCharset()); } return charset; } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java new file mode 100644 index 000000000..783b82ddc --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic; + +import static org.junit.Assert.assertEquals; + +import java.nio.charset.StandardCharsets; + +import org.junit.Test; + +public class SiteTest { + + @Test + public void test() { + Site site = Site.me().setDefaultCharset(StandardCharsets.UTF_8.name()); + assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset()); + } + +} From 12ce86425f4f5b09be06e49f0d19e84dfa10c54b Mon Sep 17 00:00:00 2001 From: hooyantsing Date: Fri, 3 Feb 2023 22:48:58 +0800 Subject: [PATCH 06/15] =?UTF-8?q?BugFix:=20Jsoup=20=E5=92=8C=20HtmlCleaner?= =?UTF-8?q?=20=E6=9E=84=E5=BB=BA=20Dom=20=E6=97=B6=EF=BC=8C=E8=8B=A5?= =?UTF-8?q?=E7=BC=BA=E5=A4=B1=20table=20=E6=A0=87=E7=AD=BE=EF=BC=8C?= =?UTF-8?q?=E5=88=99=E6=97=A0=E6=B3=95=E6=AD=A3=E5=B8=B8=E8=A7=A3=E6=9E=90?= =?UTF-8?q?=20tr=20=E5=92=8C=20td=20=E6=A0=87=E7=AD=BE=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../selector/BaseElementSelector.java | 10 ++----- .../webmagic/utils/BaseSelectorUtils.java | 23 +++++++++++++++ .../webmagic/selector/Xpath2Selector.java | 28 +++++++++++-------- .../webmagic/selector/XpathSelectorTest.java | 19 +++++++++++++ 4 files changed, 60 insertions(+), 20 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index b267d5ba9..6001767d8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -3,6 +3,7 @@ import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import us.codecraft.webmagic.utils.BaseSelectorUtils; import java.util.ArrayList; import java.util.List; @@ -13,16 +14,9 @@ */ public abstract class BaseElementSelector implements Selector, ElementSelector { private Document parse(String text) { - if (text == null) { - return null; - } - // Jsoup could not parse or tag directly // https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag - if ((text.startsWith("") && text.endsWith("")) - || (text.startsWith("") && text.endsWith(""))) { - text = "" + text + "
"; - } + text = BaseSelectorUtils.preParse(text); return Jsoup.parse(text); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java new file mode 100644 index 000000000..04c0651c3 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.utils; + +/** + * @author hooy + */ +public class BaseSelectorUtils { + + /** + * Jsoup/HtmlCleaner could not parse "tr" or "td" tag directly + * https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag + * + * @param text - the html string + * @return text + */ + public static String preParse(String text) { + if (((text.startsWith("") || text.startsWith("")) + || ((text.startsWith("") || text.startsWith(""))) { + text = "" + text + "
"; + } + return text; + } + +} diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index 9d5eef9b0..b63213b62 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -8,6 +8,7 @@ import java.util.concurrent.ConcurrentHashMap; import javax.xml.namespace.NamespaceContext; +import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; @@ -29,13 +30,14 @@ import net.sf.saxon.lib.NamespaceConstant; import net.sf.saxon.xpath.XPathEvaluator; +import us.codecraft.webmagic.utils.BaseSelectorUtils; /** * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
* * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午9:39 + * Date: 13-4-21 + * Time: 上午9:39 */ public class Xpath2Selector implements Selector { @@ -111,14 +113,11 @@ private void init() throws XPathExpressionException { @Override public String select(String text) { try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result; try { - result = xPathExpression.evaluate(document, XPathConstants.NODESET); + result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(document, XPathConstants.STRING); + result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); } if (result instanceof NodeList) { NodeList nodeList = (NodeList) result; @@ -147,14 +146,11 @@ public String select(String text) { public List selectList(String text) { List results = new ArrayList(); try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result; try { - result = xPathExpression.evaluate(document, XPathConstants.NODESET); + result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(document, XPathConstants.STRING); + result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); } if (result instanceof NodeList) { NodeList nodeList = (NodeList) result; @@ -179,4 +175,12 @@ public List selectList(String text) { } return results; } + + private Document parse(String text) throws ParserConfigurationException { + // HtmlCleaner could not parse or tag directly + text = BaseSelectorUtils.preParse(text); + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(text); + return new DomSerializer(new CleanerProperties()).createDOM(tagNode); + 
} } diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 166188361..c2025e7c6 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -11,6 +11,9 @@ import org.junit.Ignore; import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; @@ -1385,6 +1388,22 @@ public void testXpath2Selector() { Assert.assertEquals("http://www.oschina.net/",