diff --git a/CHANGES.md b/CHANGES.md
index 26414db7..645cd31b 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -3,6 +3,7 @@ UNRELEASED
 * Fixed regression error in harvest calendar from VUE2 to VUE3 upgrade.
+* The meta refresh tag (which behaves almost like a 302 MOVED redirect) now also has its URL rewritten, since the service worker does not catch it. Closes https://github.com/netarchivesuite/solrwayback/issues/490
 5.4.1
 -----
diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/HtmlParserUrlRewriter.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/HtmlParserUrlRewriter.java
index 8a8bc628..3286b4db 100644
--- a/src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/HtmlParserUrlRewriter.java
+++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/HtmlParserUrlRewriter.java
@@ -27,33 +27,40 @@
 // TODO: Refactor to extend RewriterBase for better re-use
 public class HtmlParserUrlRewriter {
-    private static final Logger log = LoggerFactory.getLogger(HtmlParserUrlRewriter.class);
+    private static final Logger log = LoggerFactory.getLogger(HtmlParserUrlRewriter.class);
+
+    //Problem: jsoup text(...) and html(...) encode &, but ampersands in the HTML URLs must not be encoded (blame the HTML standard).
+    //So the value cannot be set through JSOUP and must be replaced afterwards.
+    private static final String AMPERSAND_REPLACE="_STYLE_AMPERSAND_REPLACE_";
+
+    private static final String CSS_IMPORT_PATTERN_STRING =
+            "(?s)\\s*@import\\s+(?:url)?[(]?\\s*['\"]?([^'\")]*\\.css[^'\") ]*)['\"]?\\s*[)]?.*";
+    private static Pattern CSS_IMPORT_PATTERN = Pattern.compile(CSS_IMPORT_PATTERN_STRING);
+
+
+    //Example: <meta http-equiv="refresh" content="0;URL=http://www.mhs.no/norsk.shtml">
+    //"URL" can also be lowercase.
+    private static String META_REFRESH_URL_PATTERN_STRING ="(?i)(<meta[^>]*http-equiv\\s*=\\s*[\"']refresh[\"'][^>]*content\\s*=\\s*[\"'][^\"'>]*;\\s*url=)([^\"'>\\s]+)";
+    private static Pattern META_REFRESH_URL_PATTERN = Pattern.compile(META_REFRESH_URL_PATTERN_STRING);
+
-    //Problem is jsoup text(...) or html(...) encodes & and for the the HTML urls must not be encoded. (blame HTML standard)
-    //So it can not be set using JSOUP and must be replaced after.
-    private static final String AMPERSAND_REPLACE="_STYLE_AMPERSAND_REPLACE_";
-
-    private static final String CSS_IMPORT_PATTERN_STRING =
-            "(?s)\\s*@import\\s+(?:url)?[(]?\\s*['\"]?([^'\")]*\\.css[^'\") ]*)['\"]?\\s*[)]?.*";
-    private static Pattern CSS_IMPORT_PATTERN = Pattern.compile(CSS_IMPORT_PATTERN_STRING);
-
-    private static final String CSS_IMPORT_PATTERN_STRING2 =
-            "(?s)\\s*@import\\s+(?:url)?[(]?\\s*['\"]?([^'\")]*\\.css[^'\") ]*)['\"]?\\s*[)]?";
-    private static Pattern CSS_IMPORT_PATTERN2 = Pattern.compile(CSS_IMPORT_PATTERN_STRING2);
-
-    private static Pattern STYLE_ELEMENT_BACKGROUND_PATTERN = Pattern.compile(
-            "background(?:-image)?\\s*:([^;}]*)");
-    private static Pattern CSS_URL_PATTERN = Pattern.compile(
-            "url\\s*\\(\\s*[\"']?([^)\"']*)[\"']?\\s*\\)");
-
-    //replacing urls that points into the world outside solrwayback because they are never harvested
+    private static final String CSS_IMPORT_PATTERN_STRING2 =
+            "(?s)\\s*@import\\s+(?:url)?[(]?\\s*['\"]?([^'\")]*\\.css[^'\") ]*)['\"]?\\s*[)]?";
+    private static Pattern CSS_IMPORT_PATTERN2 = Pattern.compile(CSS_IMPORT_PATTERN_STRING2);
+
+    private static Pattern STYLE_ELEMENT_BACKGROUND_PATTERN = Pattern.compile(
+            "background(?:-image)?\\s*:([^;}]*)");
+    private static Pattern CSS_URL_PATTERN = Pattern.compile(
+            "url\\s*\\(\\s*[\"']?([^)\"']*)[\"']?\\s*\\)");
+
+    //replacing urls that point into the world outside solrwayback because they are never harvested
 public static final String NOT_FOUND_LINK = PropertiesLoader.WAYBACK_BASEURL + "services/notfound/";
 
-    public static void main(String[] args) throws Exception{
-//      String css= new String(Files.readAllBytes(Paths.get("/home/teg/gamespot.css")));
+    public static void main(String[] args) throws Exception{
+        // String css= new String(Files.readAllBytes(Paths.get("/home/teg/gamespot.css")));
-//      System.out.println(css);
-/*
+        // System.out.println(css);
+        /*
 String[] result = css.split("\n", 100);
 //Get the lines starting with @import until you find one that does not.
 int index = 0;
@@ -73,135 +80,154 @@ public static void main(String[] args) throws Exception{
 System.out.println(" ");
 System.exit(1);
-*/
-//      System.exit(1);
+        */
+        // System.exit(1);
-    }
+    }
+
+    /* CSS can start with the following @import variants, which also need to be URL rewritten:
+     * @import "mystyle.css";
+     * @import url(slidearrows.css);
+     * @import url(shadow_frames.css) print;
+     */
+    // TODO: Switch to RegexpReplacer for this
+    public static String replaceLinksCss(ArcEntry arc) throws Exception{
+
+        String type="downloadRaw"; //not supporting nested @imports...
+        String css = arc.getStringContentAsStringSafe();
+        String url=arc.getUrl();
+
+        String[] result = css.split("\n", 100); //Doubt there will be more than 100 of these.
+        //Get the lines starting with @import until you find one that does not.
+        int index = 0;
+
+        while (index
-    /**
-     * Extracts the HTML from the ArcEntry and replaces links and other URLs with the archived versions that are
-     * closest to the ArcEntry in time.
-     * @param arc an arc-entry that is expected to be a HTML page.
-     * @param lenient if true, lenient URL-matching is used.
-     *                See {@link dk.kb.netarchivesuite.solrwayback.util.UrlUtils#lenientURLQuery(String)}.
-     * @return the page with links to archived versions instead of live web version.
-     * @throws Exception if link-resolving failed.
-     */
-    public static ParseResult replaceLinks(ArcEntry arc, boolean lenient) throws Exception{
-        final long startMS = System.currentTimeMillis();
-        return replaceLinks(
-                arc.getStringContentAsStringSafe(), arc.getUrl(), arc.getCrawlDate(),
-                (urls, timeStamp) -> NetarchiveSolrClient.getInstance().findNearestUrlsShort(urls, timeStamp, lenient),
-                startMS);
-    }
-
-    /**
-     * Replaces links and other URLs with the archived versions that are closest to the links in the html in time.
-     * @param html the web page to use as basis for replacing links.
-     * @param url the URL for the html (needed for resolving relative links).
-     * @param crawlDate the ideal timestamp for the archived versions to link to.
-     * @param nearestResolver handles url -> archived-resource lookups based on smallest temporal distance to crawlDate.
-     * @throws Exception if link resolving failed.
-     */
-    public static ParseResult replaceLinks(
-            String html, String url, String crawlDate, NearestResolver nearestResolver) throws Exception {
-        return replaceLinks(html, url, crawlDate, nearestResolver, System.currentTimeMillis());
-    }
-    // startMS used to measure total time, including resolving of the HTML
-    private static ParseResult replaceLinks(
-            String html, String url, String crawlDate,
-            NearestResolver nearestResolver, long startMS) throws Exception {
-        final long preReplaceMS = System.currentTimeMillis()-startMS;
-        long replaceMS = -System.currentTimeMillis();
-
-        final String waybackDate = DateUtils.convertUtcDate2WaybackDate(crawlDate);
-        Document doc = Jsoup.parse(html, url);
-
-        // Collect URLs and resolve archived versions for them
-        Set<String> urlSet = getUrlResourcesForHtmlPage(doc, url);
-        log.debug("#unique urlset to resolve for arc-url '" + url + "' :" + urlSet.size());
-
-        long resolveMS = -System.currentTimeMillis();
-        List<IndexDocShort> docs = nearestResolver.findNearestHarvestTime(urlSet, crawlDate);
-        resolveMS += System.currentTimeMillis();
-
-        // Rewriting to url_norm, so it can be matched when replacing.
-        final CountingMap<String, IndexDocShort> urlReplaceMap = new CountingMap<>();
-        for (IndexDocShort indexDoc: docs){
-            urlReplaceMap.put(indexDoc.getUrl_norm(), indexDoc);
-        }
+    /**
+     * Replaces the URL in a meta refresh tag with the archived version.
+     * Example: see META_REFRESH_URL_PATTERN_STRING at the top of the class.
+     * This replacement does not follow the usual SolrWayback replacement framework: the content value is e.g.
+     * '0;URL=http://www.mhs.no/norsk.shtml', so the URL is embedded in a larger attribute value
+     * instead of being an attribute of its own.
+     */
+    public static String replaceMetaRefreshForHtml(String html, String waybackDate) throws Exception{
+        Matcher matcher = META_REFRESH_URL_PATTERN.matcher(html);
+        if (matcher.find()) {
+            // Group 2 is the original URL
+            String oldUrl = matcher.group(2);
+            String newUrl = PropertiesLoader.WAYBACK_BASEURL+"services/web/"+waybackDate+"/"+oldUrl;
+            html = matcher.replaceAll("$1"+Matcher.quoteReplacement(newUrl)); //quoteReplacement guards against '$' and '\' in the URL
+        }
+        return html;
+    }
+
+    /**
+     * Extracts the HTML from the ArcEntry and replaces links and other URLs with the archived versions that are
+     * closest to the ArcEntry in time.
+     * @param arc an arc-entry that is expected to be an HTML page.
+     * @param lenient if true, lenient URL-matching is used.
+     *                See {@link dk.kb.netarchivesuite.solrwayback.util.UrlUtils#lenientURLQuery(String)}.
+     * @return the page with links to archived versions instead of the live web versions.
+     * @throws Exception if link-resolving failed.
+     */
+    public static ParseResult replaceLinks(ArcEntry arc, boolean lenient) throws Exception{
+        final long startMS = System.currentTimeMillis();
+        return replaceLinks(
+                arc.getStringContentAsStringSafe(), arc.getUrl(), arc.getCrawlDate(),
+                (urls, timeStamp) -> NetarchiveSolrClient.getInstance().findNearestUrlsShort(urls, timeStamp, lenient),
+                startMS);
+    }
+
+    /**
+     * Replaces links and other URLs with the archived versions that are closest to the links in the html in time.
+     * @param html the web page to use as basis for replacing links.
+     * @param url the URL for the html (needed for resolving relative links).
+     * @param crawlDate the ideal timestamp for the archived versions to link to.
+     * @param nearestResolver handles url -> archived-resource lookups based on smallest temporal distance to crawlDate.
+     * @throws Exception if link resolving failed.
+     */
+    public static ParseResult replaceLinks(
+            String html, String url, String crawlDate, NearestResolver nearestResolver) throws Exception {
+        return replaceLinks(html, url, crawlDate, nearestResolver, System.currentTimeMillis());
+    }
+    // startMS used to measure total time, including resolving of the HTML
+    private static ParseResult replaceLinks(
+            String html, String url, String crawlDate,
+            NearestResolver nearestResolver, long startMS) throws Exception {
+        final long preReplaceMS = System.currentTimeMillis()-startMS;
+        long replaceMS = -System.currentTimeMillis();
+
+        final String waybackDate = DateUtils.convertUtcDate2WaybackDate(crawlDate);
+        Document doc = Jsoup.parse(html, url);
+
+        // Collect URLs and resolve archived versions for them
+        Set<String> urlSet = getUrlResourcesForHtmlPage(doc, url);
+        log.debug("#unique urlset to resolve for arc-url '" + url + "' :" + urlSet.size());
+
+        long resolveMS = -System.currentTimeMillis();
+        List<IndexDocShort> docs = nearestResolver.findNearestHarvestTime(urlSet, crawlDate);
+        resolveMS += System.currentTimeMillis();
+
+        // Rewriting to url_norm, so it can be matched when replacing.
+        final CountingMap<String, IndexDocShort> urlReplaceMap = new CountingMap<>();
+        for (IndexDocShort indexDoc: docs){
+            urlReplaceMap.put(indexDoc.getUrl_norm(), indexDoc);
+        }
 // Replace URLs in the document with URLs for archived versions.
-    UnaryOperator<String> rewriterRaw = createTransformer(
-            urlReplaceMap, "downloadRaw", "");
+        UnaryOperator<String> rewriterRaw = createTransformer(
+                urlReplaceMap, "downloadRaw", "");
 processElement(doc, "img", "abs:src", rewriterRaw);
 processElement(doc, "img", "abs:data-src", rewriterRaw); // JQuery convention used for delayed loading of images
 processElement(doc, "embed", "abs:src", rewriterRaw);
 processElement(doc, "source", "abs:src", rewriterRaw);
-    processElement(doc, "script", "abs:src", rewriterRaw);
-    processElement(doc, "body", "abs:background", rewriterRaw);
-    processElement(doc, "table", "abs:background", rewriterRaw);
-    processElement(doc, "td", "abs:background", rewriterRaw);
-
-    // link elements are mostly used to reference stylesheets, which must be transformed before use
-    UnaryOperator<String> rewriterView = createTransformer(
-            urlReplaceMap, "view", "");
-    processElement(doc, "link", "abs:href", rewriterView);
-
-    // Don't show SolrWayback bar in frames
-    UnaryOperator<String> rewriterViewNoBar = createTransformer(
-            urlReplaceMap, "view", "&showToolbar=false");
-    processElement(doc, "frame", "abs:src", rewriterViewNoBar);
+        processElement(doc, "script", "abs:src", rewriterRaw);
+        processElement(doc, "body", "abs:background", rewriterRaw);
+        processElement(doc, "table", "abs:background", rewriterRaw);
+        processElement(doc, "td", "abs:background", rewriterRaw);
+
+        // link elements are mostly used to reference stylesheets, which must be transformed before use
+        UnaryOperator<String> rewriterView = createTransformer(
+                urlReplaceMap, "view", "");
+        processElement(doc, "link", "abs:href", rewriterView);
+
+        // Don't show SolrWayback bar in frames
+        UnaryOperator<String> rewriterViewNoBar = createTransformer(
+                urlReplaceMap, "view", "&showToolbar=false");
+        processElement(doc, "frame", "abs:src", rewriterViewNoBar);
 processElement(doc, "iframe", "abs:src", rewriterViewNoBar);
-        // Links to external resources are not resolved until clicked
-        UnaryOperator<String> rewriterRawNoResolve = (sourceURL) ->
-                PropertiesLoader.WAYBACK_BASEURL + "services/web/" + waybackDate + "/" + sourceURL;
+        // Links to external resources are not resolved until clicked
+        UnaryOperator<String> rewriterRawNoResolve = (sourceURL) ->
+                PropertiesLoader.WAYBACK_BASEURL + "services/web/" + waybackDate + "/" + sourceURL;
 processElement(doc, "a", "abs:href", rewriterRawNoResolve);
 processElement(doc, "area", "abs:href", rewriterRawNoResolve);
 processElement(doc, "form", "abs:action", rewriterRawNoResolve);
@@ -209,84 +235,88 @@ private static ParseResult replaceLinks(
 // Multi value elements
 processMultiAttribute(doc, "img", "srcset", rewriterRaw);
 processMultiAttribute(doc, "img", "data-srcset", rewriterRaw);
-    processMultiAttribute(doc, "source", "srcset", rewriterRaw);
+        processMultiAttribute(doc, "source", "srcset", rewriterRaw);
-    // Full content processing
-    // TODO: Why the raw rewrite? Shouldn't this be view?
-    UnaryOperator<String> rewriterRawAmpersand = (sourceURL) -> {
-        sourceURL = rewriterRaw.apply(sourceURL);
-        return sourceURL == null ? null : sourceURL.replace("&", AMPERSAND_REPLACE);
-    };
-    // TODO: Move this to ScriptRewriter
-    processElementRegexp(doc, "style", null, rewriterRawAmpersand, CSS_IMPORT_PATTERN2);
+        // Full content processing
+        // TODO: Why the raw rewrite? Shouldn't this be view?
+        UnaryOperator<String> rewriterRawAmpersand = (sourceURL) -> {
+            sourceURL = rewriterRaw.apply(sourceURL);
+            return sourceURL == null ? null : sourceURL.replace("&", AMPERSAND_REPLACE);
+        };
+
+
+        processElementRegexp(doc, "meta", null, rewriterRawAmpersand, META_REFRESH_URL_PATTERN);
-    processElementRegexp(doc, "*", "style", rewriterRaw, STYLE_ELEMENT_BACKGROUND_PATTERN, CSS_URL_PATTERN);
+        // TODO: Move this to ScriptRewriter
+        processElementRegexp(doc, "style", null, rewriterRawAmpersand, CSS_IMPORT_PATTERN2);
-    // Script content is handled by ScriptRewriter
-    rewriteInlineScripts(doc, crawlDate, urlReplaceMap);
+        processElementRegexp(doc, "*", "style", rewriterRaw, STYLE_ELEMENT_BACKGROUND_PATTERN, CSS_URL_PATTERN);
-    replaceMS += System.currentTimeMillis();
-    /*
+        // Script content is handled by ScriptRewriter
+        rewriteInlineScripts(doc, crawlDate, urlReplaceMap);
+
+        replaceMS += System.currentTimeMillis();
+        /*
 log.debug(String.format(
 "replaceLinks('%s', %s): Links unique=%d, replaced=%d, not_found=%d. " +
 "Time total=%dms (resolveHTML=%dms, analysis+adjustment=%dms, resolveResources=%dms)",
 url, crawlDate, urlSet.size(), urlReplaceMap.getFoundCount(), urlReplaceMap.getFailCount(),
 preReplaceMS+replaceMS, preReplaceMS, replaceMS-resolveMS, resolveMS));
-    */
-
-    String html_output= doc.toString();
-    html_output = RewriterBase.unescape(html_output);
-
-    ParseResult res = new ParseResult();
-    res.setReplaced(html_output);
-    res.setNumberOfLinksReplaced(urlReplaceMap.getFoundCount());
-    res.setNumberOfLinksNotFound(urlReplaceMap.getFailCount());
-    return res;
-    }
-
-    private static void rewriteInlineScripts(
-            Document doc, String crawlDate, Map<String, IndexDocShort> urlReplaceMap) {
-        processElement(doc, "script", null, (content) -> {
-            try {
-                ParseResult scriptResult = ScriptRewriter.getInstance().replaceLinks(
-                        content, doc.baseUri(), crawlDate, urlReplaceMap, RewriterBase.PACKAGING.inline, true);
-                return scriptResult.getReplaced();
-            } catch (Exception e) {
-                log.warn("Exception while parsing inline script for " + doc.baseUri() + " " + crawlDate, e);
-                return content;
-            }
-        });
-    }
-
-    /**
-     * Generic transformer creator that normalises the incoming URL and return a link to an archived version,
-     * if such a version exists. Else a {@code notfound} link is returned.
-     * If the URL us a {@code data:} URL, it is returned unmodified.
-     * @param urlReplaceMap a map of archived versions for normalised URLs on the page.
-     * @param type view or downloadRAW.
-     * @param extraParams optional extra parameters for the URL to return.
-     * @return an URL to an archived version of the resource that the URL designates or a {@code notfound} URL.
-     */
-    private static UnaryOperator<String> createTransformer(
+        */
+
+        String html_output= doc.toString();
+        html_output = RewriterBase.unescape(html_output);
+
+        ParseResult res = new ParseResult();
+        res.setReplaced(html_output);
+        res.setNumberOfLinksReplaced(urlReplaceMap.getFoundCount());
+        res.setNumberOfLinksNotFound(urlReplaceMap.getFailCount());
+        return res;
+    }
+
+    private static void rewriteInlineScripts(
+            Document doc, String crawlDate, Map<String, IndexDocShort> urlReplaceMap) {
+        processElement(doc, "script", null, (content) -> {
+            try {
+                ParseResult scriptResult = ScriptRewriter.getInstance().replaceLinks(
+                        content, doc.baseUri(), crawlDate, urlReplaceMap, RewriterBase.PACKAGING.inline, true);
+                return scriptResult.getReplaced();
+            } catch (Exception e) {
+                log.warn("Exception while parsing inline script for " + doc.baseUri() + " " + crawlDate, e);
+                return content;
+            }
+        });
+    }
+
+    /**
+     * Generic transformer creator that normalises the incoming URL and returns a link to an archived version,
+     * if such a version exists. Else a {@code notfound} link is returned.
+     * If the URL is a {@code data:} URL, it is returned unmodified.
+     * @param urlReplaceMap a map of archived versions for normalised URLs on the page.
+     * @param type view or downloadRaw.
+     * @param extraParams optional extra parameters for the URL to return.
+     * @return a URL to an archived version of the resource that the URL designates, or a {@code notfound} URL.
+     */
+    private static UnaryOperator<String> createTransformer(
 Map<String, IndexDocShort> urlReplaceMap, String type, String extraParams) {
-        return (String sourceURL) -> {
-            if (sourceURL.startsWith("data:")) {
-                return sourceURL;
-            }
-            sourceURL = sourceURL.replace("/../", "/");
-            sourceURL = sourceURL.replace("/../", "/");
-
-            IndexDocShort indexDoc = urlReplaceMap.get(Normalisation.canonicaliseURL(sourceURL));
-            if (indexDoc != null){
-                return PropertiesLoader.WAYBACK_BASEURL + "services/" + type +
-                        "?source_file_path=" + indexDoc.getSource_file_path() +
-                        "&offset=" + indexDoc.getOffset() +
-                        (extraParams == null ? "" : extraParams);
-            }
-            log.debug("No harvest found for:"+sourceURL);
-            return NOT_FOUND_LINK;
-        };
-    }
+        return (String sourceURL) -> {
+            if (sourceURL.startsWith("data:")) {
+                return sourceURL;
+            }
+            sourceURL = sourceURL.replace("/../", "/");
+            sourceURL = sourceURL.replace("/../", "/");
+
+            IndexDocShort indexDoc = urlReplaceMap.get(Normalisation.canonicaliseURL(sourceURL));
+            if (indexDoc != null){
+                return PropertiesLoader.WAYBACK_BASEURL + "services/" + type +
+                        "?source_file_path=" + indexDoc.getSource_file_path() +
+                        "&offset=" + indexDoc.getOffset() +
+                        (extraParams == null ? "" : extraParams);
+            }
+            log.debug("No harvest found for:"+sourceURL);
+            return NOT_FOUND_LINK;
+        };
+    }
 
 /**
 * Collect URLs for resources on the page, intended for later replacement with links to archived versions.
@@ -294,8 +324,8 @@ private static UnaryOperator<String> createTransformer(
 * @param baseURL baseURL for the web page, used for resolving relative URLs.
 * @return a Set of URLs found on the page.
 */
-    public static HashSet<String> getUrlResourcesForHtmlPage(Document doc, String baseURL) {
-        URLAbsoluter absoluter = new URLAbsoluter(baseURL, true);
+    public static HashSet<String> getUrlResourcesForHtmlPage(Document doc, String baseURL) {
+        URLAbsoluter absoluter = new URLAbsoluter(baseURL, true);
 final HashSet<String> urlSet = new HashSet<>();
 UnaryOperator<String> collector = (String sourceURL) -> {
 urlSet.add(absoluter.apply(sourceURL));
@@ -306,159 +336,160 @@ public static HashSet<String> getUrlResourcesForHtmlPage(Document doc, String ba
 processElement(doc, "img", "abs:data-src", collector); // JQuery convention used for delayed loading of images
 processElement(doc, "embed", "abs:src", collector);
 processElement(doc, "source", "abs:src", collector);
-    processElement(doc, "script", "abs:src", collector);
+        processElement(doc, "script", "abs:src", collector);
-    processElement(doc, "body", "abs:background", collector);
-    processElement(doc, "td", "abs:background", collector);
-    processElement(doc, "table", "abs:background", collector);
-    processElement(doc, "area", "abs:href", collector); // Why is this collected? It is not replaced later on
+        processElement(doc, "body", "abs:background", collector);
+        processElement(doc, "td", "abs:background", collector);
+        processElement(doc, "table", "abs:background", collector);
+        processElement(doc, "area", "abs:href", collector); // Why is this collected? It is not replaced later on
-    processElement(doc, "link", "abs:href", collector);
+        processElement(doc, "link", "abs:href", collector);
-    processElement(doc, "frame", "abs:src", collector);
+        processElement(doc, "frame", "abs:src", collector);
 processElement(doc, "iframe", "abs:src", collector);
-
-    processMultiAttribute(doc, "img", "srcset", collector);
-    processMultiAttribute(doc, "img", "data-srcset", collector);
-    processMultiAttribute(doc, "source", "srcset", collector);
-    processElementRegexp(doc, "style", null, collector, CSS_IMPORT_PATTERN2);
-    processElementRegexp(doc, "*", "style", collector, STYLE_ELEMENT_BACKGROUND_PATTERN, CSS_URL_PATTERN);
+        processMultiAttribute(doc, "img", "srcset", collector);
+        processMultiAttribute(doc, "img", "data-srcset", collector);
+        processMultiAttribute(doc, "source", "srcset", collector);
-    // Get URLs from the ScriptRewriter
-    processElement(doc, "script", null, (content) -> {
-        urlSet.addAll(ScriptRewriter.getInstance().getResourceURLs(content, baseURL));
-        return null;
-    });
+        processElementRegexp(doc, "style", null, collector, CSS_IMPORT_PATTERN2);
+        processElementRegexp(doc, "meta", null, collector, META_REFRESH_URL_PATTERN);
+        processElementRegexp(doc, "*", "style", collector, STYLE_ELEMENT_BACKGROUND_PATTERN, CSS_URL_PATTERN);
+
+        // Get URLs from the ScriptRewriter
+        processElement(doc, "script", null, (content) -> {
+            urlSet.addAll(ScriptRewriter.getInstance().getResourceURLs(content, baseURL));
+            return null;
+        });
 return urlSet;
-    }
+    }
+
+    public static String generatePwid(ArcEntry arc) throws Exception{
-    public static String generatePwid(ArcEntry arc) throws Exception{
+        long start = System.currentTimeMillis();
+        String html = arc.getStringContentAsStringSafe();
+        String url=arc.getUrl();
-        long start = System.currentTimeMillis();
-        String html = arc.getStringContentAsStringSafe();
-        String url=arc.getUrl();
+        String collectionName = PropertiesLoader.PID_COLLECTION_NAME;
+        // TODO: Switch to streaming based parsing with limit on input size
+        Document doc = Jsoup.parse(html, url);
-        String collectionName = PropertiesLoader.PID_COLLECTION_NAME;
-        // TODO: Switch to streaming based parsing with limit on input size
-        Document doc = Jsoup.parse(html, url);
+        HashSet<String> urlSet = getUrlResourcesForHtmlPage(doc, url);
-        HashSet<String> urlSet = getUrlResourcesForHtmlPage(doc, url);
-
-        log.info("#unique urlset to resolve:"+urlSet.size());
+        log.info("#unique urlset to resolve:"+urlSet.size());
-        StringBuffer buf = new StringBuffer();
-        NetarchiveSolrClient.getInstance().findNearestHarvestTimeForMultipleUrlsFewFields(urlSet,arc.getCrawlDate()).
-        forEach(shortDoc -> {
-            buf.append("\n");
-            buf.append("urn:pwid:"+collectionName+":"+shortDoc.getCrawlDate()+":part:"+shortDoc.getUrl() +"\n");
-            buf.append("\n");
-            //pwid:netarkivet.dk:time:part:url
-        });
-        return buf.toString();
-    }
+        StringBuffer buf = new StringBuffer();
+        NetarchiveSolrClient.getInstance().findNearestHarvestTimeForMultipleUrlsFewFields(urlSet,arc.getCrawlDate()).
+        forEach(shortDoc -> {
+            buf.append("\n");
+            buf.append("urn:pwid:"+collectionName+":"+shortDoc.getCrawlDate()+":part:"+shortDoc.getUrl() +"\n");
+            buf.append("\n");
+            //pwid:netarkivet.dk:time:part:url
+        });
+        return buf.toString();
+    }
+
-
-    public static HashSet<String> getResourceLinksForHtmlFromArc(ArcEntry arc) throws Exception{
+    public static HashSet<String> getResourceLinksForHtmlFromArc(ArcEntry arc) throws Exception{
-        String html = arc.getStringContentAsStringSafe();
+        String html = arc.getStringContentAsStringSafe();
-        String url=arc.getUrl();
+        String url=arc.getUrl();
-        // TODO: Switch to streaming based parsing with limit on input size
-        Document doc = Jsoup.parse(html,url);
+        // TODO: Switch to streaming based parsing with limit on input size
+        Document doc = Jsoup.parse(html,url);
-
-        HashSet<String> urlSet = getUrlResourcesForHtmlPage(doc, url);
-
-        return urlSet;
+
+        HashSet<String> urlSet = getUrlResourcesForHtmlPage(doc, url);
+
+        return urlSet;
 }
-    /**
-     * Resolves instances of documents based on time distance.
-     */
-    public interface NearestResolver {
-        /**
-         * Locates one instance of each url, as close to timeStamp as possible.
-         * @param urls the URLs to resolve.
-         * @param isoTime a timestamp formatted as {@code YYYY-MM-ddTHH:MM:SSZ}.
-         * @return IndexDocs for the located URLs containing at least
-         *         {@code url_norm, url, source_file, source_file_offset} for each document.
-         */
-        List<IndexDocShort> findNearestHarvestTime(Collection<String> urls, String isoTime) throws Exception;
-    }
-
-
-    /**
-     * Iterates all matching element+attribute, then all outerRegexp {@code .group(1)}-matching content is applied to
-     * innerRegexp and the group(1)-matches from that is sent throught the transformer.
-     * Note1: The content of the matching innerRegexp group 1 is expected to be an URL and will be made absolute before
-     *        being transformed.
-     * Note2: Content returned from transformer will be entity encoded by JSOUP, if attribute is null.
-     *        If an ampersand {@code &} is to remain non-encoded, replace it with {@link #AMPERSAND_REPLACE} in the
-     *        content before returning it.
-     * @param doc a JSOUP document, representing part on a HTML page.
-     * @param element an HTML element.
-     * @param attribute an attribute for the HTML element.
-* If the attribute is null, the content of the element is used.
-     * @param regexps the content of the matching nodes will be matched by the first regexp and {@code .group(1)}
-     *                will be fed to the next regexp and so forth. When there are no more regexps, the content
-     *                will be processed by transformer.
-     * @param transformer takes the regexp matching content of the attribute and provides the new content.
-     *                    If null is returned, the content will not be changed.
-     */
-    public static void processElementRegexp(
-            Document doc, String element, String attribute, UnaryOperator<String> transformer, Pattern... regexps) {
-        final URLAbsoluter absoluter = new URLAbsoluter(doc.baseUri(), true);
-        UnaryOperator<String> processor = url ->
-                // TODO: Should canonicalization not be the responsibility of the collector?
-                transformer.apply(absoluter.apply(url));
-        for (int i = regexps.length-1 ; i >= 0 ; i--) {
-            processor = new RegexpReplacer(regexps[i], processor);
-        }
-        processElement(doc, element, attribute, processor);
-    }
+    /**
+     * Resolves instances of documents based on time distance.
+     */
+    public interface NearestResolver {
+        /**
+         * Locates one instance of each url, as close to timeStamp as possible.
+         * @param urls the URLs to resolve.
+         * @param isoTime a timestamp formatted as {@code YYYY-MM-ddTHH:MM:SSZ}.
+         * @return IndexDocs for the located URLs containing at least
+         *         {@code url_norm, url, source_file, source_file_offset} for each document.
+         */
+        List<IndexDocShort> findNearestHarvestTime(Collection<String> urls, String isoTime) throws Exception;
+    }
+
+
+    /**
+     * Iterates all matching element+attribute, then all outerRegexp {@code .group(1)}-matching content is applied to
+     * innerRegexp and the group(1)-matches from that are sent through the transformer.
+     * Note1: The content of the matching innerRegexp group 1 is expected to be a URL and will be made absolute before
+     *        being transformed.
+     * Note2: Content returned from transformer will be entity encoded by JSOUP, if attribute is null.
+     *        If an ampersand {@code &} is to remain non-encoded, replace it with {@link #AMPERSAND_REPLACE} in the
+     *        content before returning it.
+     * @param doc a JSOUP document, representing part of an HTML page.
+     * @param element an HTML element.
+     * @param attribute an attribute for the HTML element.
+     *                  If the attribute is null, the content of the element is used.
+     * @param regexps the content of the matching nodes will be matched by the first regexp and {@code .group(1)}
+     *                will be fed to the next regexp and so forth. When there are no more regexps, the content
+     *                will be processed by transformer.
+     * @param transformer takes the regexp matching content of the attribute and provides the new content.
+     *                    If null is returned, the content will not be changed.
+     */
+    public static void processElementRegexp(
+            Document doc, String element, String attribute, UnaryOperator<String> transformer, Pattern... regexps) {
+        final URLAbsoluter absoluter = new URLAbsoluter(doc.baseUri(), true);
+        UnaryOperator<String> processor = url ->
+                // TODO: Should canonicalization not be the responsibility of the collector?
+                transformer.apply(absoluter.apply(url));
+        for (int i = regexps.length-1 ; i >= 0 ; i--) {
+            processor = new RegexpReplacer(regexps[i], processor);
+        }
+        processElement(doc, element, attribute, processor);
+    }
 
 /**
 * Iterates all matching element+attribute and applies the transformer on the content.
-    * Expects URLs extracted from the attribute to be delivered as absolute by JSOUP.
-    * Note: If the attribute is null, the content of the element will be used. When assigning new content to the
-    * element, it will be entity escaped.
+     * Expects URLs extracted from the attribute to be delivered as absolute by JSOUP.
+     * Note: If the attribute is null, the content of the element will be used. When assigning new content to the
+     *       element, it will be entity escaped.
 * @param doc a JSOUP document, representing part on a HTML page.
 * @param element an HTML element.
 * @param attribute an attribute for the HTML element.
-    * If the attribute is null, the content of the element is used.
-    * If the attribute is prefixed with {@code abs:}, JSOUP will attempt to make is an absolute URL.
+     *                  If the attribute is null, the content of the element is used.
+     *                  If the attribute is prefixed with {@code abs:}, JSOUP will attempt to make it an absolute URL.
 * @param transformer takes the content of the attribute and provides the new content.
 * If null is returned, the content will not be changed.
 */
-    public static void processElement(
-            Document doc, String element, String attribute, UnaryOperator<String> transformer) {
-        for (Element e : doc.select(element)) {
-            String content = attribute == null || attribute.isEmpty() ? e.data() : e.attr(attribute);
-            if (content == null || content.trim().isEmpty()){
-                continue;
-            }
+    public static void processElement(
+            Document doc, String element, String attribute, UnaryOperator<String> transformer) {
+        for (Element e : doc.select(element)) {
+            String content = attribute == null || attribute.isEmpty() ? e.data() : e.attr(attribute);
+            if (content == null || content.trim().isEmpty()){
+                continue;
+            }
 String newContent = transformer.apply(content);
-            if (newContent != null && !newContent.equals(content)) {
-                if (attribute == null || attribute.isEmpty()) {
-                    e.html(newContent.replace("\n", RewriterBase.NEWLINE_PLACEHOLDER));
-                } else {
-                    e.attr(attribute.replaceFirst("abs:", ""), newContent);
-                }
-            }
-        }
-    }
+            if (newContent != null && !newContent.equals(content)) {
+                if (attribute == null || attribute.isEmpty()) {
+                    e.html(newContent.replace("\n", RewriterBase.NEWLINE_PLACEHOLDER));
+                } else {
+                    e.attr(attribute.replaceFirst("abs:", ""), newContent);
+                }
+            }
+        }
+    }
 
 /**
 * Iterates all matching element+attribute, splits the content on {@code ,} and subsequently {@code } (space),
-    * applying the transformer on the extracted content.
-    * If the urls in the content are not absolute then {@code doc.baseUri()} is used for making it absolute.
+     * applying the transformer on the extracted content.
+     * If the urls in the content are not absolute then {@code doc.baseUri()} is used for making them absolute.
 * @param doc a JSOUP document, representing part on a HTML page.
 * @param element an HTML element.
 * @param attribute an attribute for the HTML element.
@@ -466,15 +497,15 @@ public static void processElement(
 * If null is returned, the content will not be changed.
 */
-    public static void processMultiAttribute(
-            Document doc, String element, String attribute, UnaryOperator<String> transformer) {
-        URLAbsoluter absoluter = new URLAbsoluter(doc.baseUri(), false);
-        processElementRegexp(doc, element, attribute,
-                url -> transformer.apply(absoluter.apply(url)),
-                COMMA_SEPARATED_PATTERN, SPACE_SEPARATED_PATTERN);
-    }
-    private static final Pattern COMMA_SEPARATED_PATTERN = Pattern.compile("([^,]+),?");
-    private static final Pattern SPACE_SEPARATED_PATTERN = Pattern.compile("([^ ]+) ?.*");
+    public static void processMultiAttribute(
+            Document doc, String element, String attribute, UnaryOperator<String> transformer) {
+        URLAbsoluter absoluter = new URLAbsoluter(doc.baseUri(), false);
+        processElementRegexp(doc, element, attribute,
+                url -> transformer.apply(absoluter.apply(url)),
+                COMMA_SEPARATED_PATTERN, SPACE_SEPARATED_PATTERN);
+    }
+    private static final Pattern COMMA_SEPARATED_PATTERN = Pattern.compile("([^,]+),?");
+    private static final Pattern SPACE_SEPARATED_PATTERN = Pattern.compile("([^ ]+) ?.*");
 }
diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/playback/HtmlPlayback.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/playback/HtmlPlayback.java
index daf43047..a1b821b9 100644
--- a/src/main/java/dk/kb/netarchivesuite/solrwayback/playback/HtmlPlayback.java
+++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/playback/HtmlPlayback.java
@@ -25,7 +25,10 @@ public ArcEntry playback(boolean lenient) throws Exception{
 ParseResult htmlReplaced = HtmlParserUrlRewriter.replaceLinks(arc, lenient);
-    String textReplaced=htmlReplaced.getReplaced();
+        String textReplaced=htmlReplaced.getReplaced();
+        //The meta refresh tag needs a special parse step since it does not follow the normal replacement rules: the URL is embedded in a larger attribute value.
+        textReplaced=HtmlParserUrlRewriter.replaceMetaRefreshForHtml(textReplaced,arc.getWaybackDate());
+        htmlReplaced.setReplaced(textReplaced);
 boolean xhtml = doc.getContentType().toLowerCase().contains("application/xhtml");
 //Inject tooolbar
@@ -33,7 +36,7 @@ public ArcEntry playback(boolean lenient) throws Exception{
 textReplaced = WaybackToolbarInjecter.injectWaybacktoolBar(doc.getSource_file_path(),doc.getOffset(),htmlReplaced , xhtml);
 }
-    arc.setStringContent(textReplaced);
+        arc.setStringContent(textReplaced);
 log.info("Generating webpage total processing:"+(System.currentTimeMillis()-start) + " "+doc.getSource_file_path()+ " "+ doc.getOffset() +" "+arc.getUrl());
 arc.setHasBeenDecompressed(true);
diff --git a/src/test/java/dk/kb/netarchivesuite/solrwayback/parsers/HtmlParserUrlRewriterTest.java b/src/test/java/dk/kb/netarchivesuite/solrwayback/parsers/HtmlParserUrlRewriterTest.java
index 5f06120d..4ee5b049 100644
--- a/src/test/java/dk/kb/netarchivesuite/solrwayback/parsers/HtmlParserUrlRewriterTest.java
+++ b/src/test/java/dk/kb/netarchivesuite/solrwayback/parsers/HtmlParserUrlRewriterTest.java
@@ -99,6 +99,21 @@ public void testScriptRewriting() throws Exception {
 assertRewrite("script", 0);
 }
+
+
+
+    @Test
+    public void testMetaContentRefreshUrl() throws Exception{
+        String html =
+                "<html><head>"+
+                "<meta http-equiv=\"refresh\" content=\"0;URL=http://www.domainsomething.dk/index.shtml\">"
+                +"Test metaContentRefresh</body></html>";
+        String replaced = HtmlParserUrlRewriter.replaceMetaRefreshForHtml(html, "20230907191706");
+        String newUrl="url=http://localhost:0000/solrwayback/services/web/20230907191706/http://www.domainsomething.dk/index.shtml";
+        assertTrue(replaced.toLowerCase().indexOf(newUrl) > 0);
+    }
+
+
 @Test
 public void testEncodingRewriting() throws Exception {
 assertRewrite("encoding", 0);
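
Reviewer note: the core of this patch is the new META_REFRESH_URL_PATTERN plus the replaceMetaRefreshForHtml post-pass. Below is a minimal, self-contained sketch of the intended behaviour; it is not part of the patch, and BASE_URL is a hypothetical stand-in for PropertiesLoader.WAYBACK_BASEURL, used here purely for illustration.

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Standalone sketch of the meta refresh rewrite introduced in this patch.
// The pattern is copied from META_REFRESH_URL_PATTERN_STRING.
public class MetaRefreshRewriteDemo {
    private static final Pattern META_REFRESH_URL_PATTERN = Pattern.compile(
            "(?i)(<meta[^>]*http-equiv\\s*=\\s*[\"']refresh[\"'][^>]*content\\s*=\\s*[\"'][^\"'>]*;\\s*url=)([^\"'>\\s]+)");
    // Hypothetical stand-in for PropertiesLoader.WAYBACK_BASEURL.
    private static final String BASE_URL = "http://localhost:8080/solrwayback/";

    static String replaceMetaRefresh(String html, String waybackDate) {
        Matcher matcher = META_REFRESH_URL_PATTERN.matcher(html);
        if (matcher.find()) {
            // Group 1 is everything up to and including "url=", group 2 is the original URL.
            String newUrl = BASE_URL + "services/web/" + waybackDate + "/" + matcher.group(2);
            // quoteReplacement guards against '$' and '\' in the URL being treated as back-references.
            html = matcher.replaceAll("$1" + Matcher.quoteReplacement(newUrl));
        }
        return html;
    }

    public static void main(String[] args) {
        String html = "<meta http-equiv=\"refresh\" content=\"0;URL=http://www.mhs.no/norsk.shtml\">";
        // Prints the tag with the refresh URL redirected back into the archive.
        System.out.println(replaceMetaRefresh(html, "20230907191706"));
    }
}
```

Because the service worker never sees the navigation triggered by a meta refresh, doing this as a plain string post-pass in HtmlPlayback, after the JSOUP-based link rewriting and alongside the existing toolbar-injection step, appears to be the simplest place to hook it in.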