|
|
|
|
@ -1316,6 +1316,101 @@ Readability.prototype = {
|
|
|
|
|
return metadata;
|
|
|
|
|
},
|
|
|
|
|
|
|
|
|
|
/**
 * Check whether a node is an image, or contains exactly one image and nothing else,
 * either as a direct child or nested deeper among its descendants.
 *
 * @param {Element} node The element to inspect.
 * @return {boolean} True if the node is, or wraps only, a single <img>.
 **/
|
|
|
|
|
_isSingleImage: function(node) {
|
|
|
|
|
if (node.tagName === "IMG") {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (node.children.length !== 1 || node.textContent.trim() !== "") {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return this._isSingleImage(node.children[0]);
|
|
|
|
|
},
|
|
|
|
|
|
|
|
|
|
/**
 * Find all <noscript> elements that are located right after an <img> node (or after
 * an element wrapping a single <img>) and that themselves contain only one <img>
 * element. Replace the preceding image with the image from inside the <noscript> tag.
 * This improves the quality of the images we use on some sites (e.g. Medium).
 * Note that the <noscript> element itself is left in place by this method.
 *
 * @param {Document} doc The document whose images and <noscript> tags are processed.
 **/
|
|
|
|
|
_unwrapNoscriptImages: function(doc) {
|
|
|
|
|
// Find img without source or attributes that might contains image, and remove it.
|
|
|
|
|
// This is done to prevent a placeholder img is replaced by img from noscript in next step.
|
|
|
|
|
var imgs = Array.from(doc.getElementsByTagName("img"));
|
|
|
|
|
this._forEachNode(imgs, function(img) {
|
|
|
|
|
for (var i = 0; i < img.attributes.length; i++) {
|
|
|
|
|
var attr = img.attributes[i];
|
|
|
|
|
switch (attr.name) {
|
|
|
|
|
case "src":
|
|
|
|
|
case "srcset":
|
|
|
|
|
case "data-src":
|
|
|
|
|
case "data-srcset":
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
img.parentNode.removeChild(img);
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
// Next find noscript and try to extract its image
|
|
|
|
|
var noscripts = Array.from(doc.getElementsByTagName("noscript"));
|
|
|
|
|
this._forEachNode(noscripts, function(noscript) {
|
|
|
|
|
// Parse content of noscript and make sure it only contains image
|
|
|
|
|
var tmp = doc.createElement("div");
|
|
|
|
|
tmp.innerHTML = noscript.innerHTML;
|
|
|
|
|
if (!this._isSingleImage(tmp)) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// If noscript has previous sibling and it only contains image,
|
|
|
|
|
// replace it with noscript content. However we also keep old
|
|
|
|
|
// attributes that might contains image.
|
|
|
|
|
var prevElement = noscript.previousElementSibling;
|
|
|
|
|
if (prevElement && this._isSingleImage(prevElement)) {
|
|
|
|
|
var prevImg = prevElement;
|
|
|
|
|
if (prevImg.tagName !== "IMG") {
|
|
|
|
|
prevImg = prevElement.getElementsByTagName("img")[0];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
var newImg = tmp.getElementsByTagName("img")[0];
|
|
|
|
|
for (var i = 0; i < prevImg.attributes.length; i++) {
|
|
|
|
|
var attr = prevImg.attributes[i];
|
|
|
|
|
if (attr.value === "") {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (attr.name === "src" || attr.name === "srcset" || /\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
|
|
|
|
|
if (newImg.getAttribute(attr.name) === attr.value) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
var attrName = attr.name;
|
|
|
|
|
if (newImg.hasAttribute(attrName)) {
|
|
|
|
|
attrName = "data-old-" + attrName;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
newImg.setAttribute(attrName, attr.value);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement);
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
},
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Removes script tags from the document.
|
|
|
|
|
*
|
|
|
|
|
@ -1828,6 +1923,9 @@ Readability.prototype = {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Unwrap image from noscript
|
|
|
|
|
this._unwrapNoscriptImages(this._doc);
|
|
|
|
|
|
|
|
|
|
// Remove script tags from the document.
|
|
|
|
|
this._removeScripts(this._doc);
|
|
|
|
|
|
|
|
|
|
|