Readability.js update

2020-04-05 15:37:44 +07:00 · 2020-04-05 15:37:44 +07:00 · d3539ec9d7
parent 338f01be01
commit d3539ec9d7
2 changed files with 99 additions and 1 deletions
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="VcsDirectoryMappings">
-    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
  </component>
 </project>
--- a/lib/Readability.js
+++ b/lib/Readability.js
@ -1316,6 +1316,101 @@ Readability.prototype = {
    return metadata;
  },

+  /**
+   * Return true if node is an <img>, or if it wraps exactly one <img> through a chain
+   * of single-child elements that contain no non-whitespace text.
+   *
+   * @param Element
+   **/
+  _isSingleImage: function(node) {
+    if (node.tagName === "IMG") {
+      return true;
+    }
+
+    if (node.children.length !== 1 || node.textContent.trim() !== "") { // zero or several children, or visible text
+      return false;
+    }
+
+    return this._isSingleImage(node.children[0]); // descend into the only child
+  },
+
+  /**
+   * Remove <img> nodes that carry no image source, then for every <noscript> that holds
+   * exactly one image and directly follows a single-image element, replace that element
+   * with the <noscript> content (keeping the old image attributes). The <noscript> node
+   * itself is left in place here. This improves image quality on some sites (e.g. Medium).
+   *
+   * @param Document
+   **/
+  _unwrapNoscriptImages: function(doc) {
+    // Find every img that has no source and no attribute that might contain an image, and
+    // remove it, so that a placeholder img is not replaced by the noscript img in the next step.
+    var imgs = Array.from(doc.getElementsByTagName("img"));
+    this._forEachNode(imgs, function(img) {
+      for (var i = 0; i < img.attributes.length; i++) {
+        var attr = img.attributes[i];
+        switch (attr.name) {
+          case "src":
+          case "srcset":
+          case "data-src":
+          case "data-srcset":
+            return; // has a source attribute: keep this img
+        }
+
+        if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
+          return; // an attribute value looks like an image path: keep this img
+        }
+      }
+
+      img.parentNode.removeChild(img);
+    });
+
+    // Next, find each noscript and try to extract its image.
+    var noscripts = Array.from(doc.getElementsByTagName("noscript"));
+    this._forEachNode(noscripts, function(noscript) {
+      // Parse the content of the noscript and make sure it contains only a single image.
+      var tmp = doc.createElement("div");
+      tmp.innerHTML = noscript.innerHTML;
+      if (!this._isSingleImage(tmp)) {
+        return;
+      }
+
+      // If the noscript has a previous element sibling that contains only a single image,
+      // replace that sibling with the noscript content. The old image's attributes that
+      // might contain an image are kept on the new image.
+      var prevElement = noscript.previousElementSibling;
+      if (prevElement && this._isSingleImage(prevElement)) {
+        var prevImg = prevElement;
+        if (prevImg.tagName !== "IMG") { // wrapper element: use the img inside it
+          prevImg = prevElement.getElementsByTagName("img")[0];
+        }
+
+        var newImg = tmp.getElementsByTagName("img")[0];
+        for (var i = 0; i < prevImg.attributes.length; i++) {
+          var attr = prevImg.attributes[i];
+          if (attr.value === "") {
+            continue; // empty values are not copied
+          }
+
+          if (attr.name === "src" || attr.name === "srcset" || /\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
+            if (newImg.getAttribute(attr.name) === attr.value) {
+              continue; // the new img already has the same value
+            }
+
+            var attrName = attr.name;
+            if (newImg.hasAttribute(attrName)) {
+              attrName = "data-old-" + attrName; // do not overwrite the new img's own value
+            }
+
+            newImg.setAttribute(attrName, attr.value);
+          }
+        }
+
+        noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement);
+      }
+    });
+  },
+
  /**
   * Removes script tags from the document.
   *
@ -1828,6 +1923,9 @@ Readability.prototype = {
      }
    }

+    // Unwrap image from noscript
+    this._unwrapNoscriptImages(this._doc);
+
    // Remove script tags from the document.
    this._removeScripts(this._doc);