Add some more watermark removal code

2021-11-17 16:17:30 +01:00 · 2021-11-17 16:17:30 +01:00 · 1b391da815
parent 1545d76803
commit 1b391da815
4 changed files with 275 additions and 54 deletions
--- a/DeDRM_plugin/init.py
+++ b/DeDRM_plugin/init.py
@ -90,12 +90,9 @@ PLUGIN_VERSION = ".".join([str(x)for x in PLUGIN_VERSION_TUPLE])
 RESOURCE_NAME = PLUGIN_NAME + '_Help.htm'

 import codecs
-import sys, os, re
+import sys, os
 import time
-import zipfile
 import traceback
-from zipfile import ZipInfo, ZipFile, ZIP_STORED, ZIP_DEFLATED
-from contextlib import closing


 class DeDRMError(Exception):
@ -211,55 +208,31 @@ class DeDRM(FileTypePlugin):
        # This is called after the DRM is removed (or if no DRM was present)
        # It does stuff like de-obfuscating fonts (by calling checkFonts) 
        # or removing watermarks. 
-        path_to_ebook = self.checkFonts(path_to_ebook)
-        path_to_ebook = self.removeCDPwatermarkFromEPUB(path_to_ebook)
+
+        try: 
+            import calibre_plugins.dedrm.prefs as prefs
+            dedrmprefs = prefs.DeDRM_Prefs()
+
+            if dedrmprefs["deobfuscate_fonts"] is True:
+                # Deobfuscate fonts
+                path_to_ebook = self.checkFonts(path_to_ebook) or path_to_ebook
+
+            if dedrmprefs["remove_watermarks"] is True:
+                import calibre_plugins.dedrm.epubwatermark as watermark
+
+                # Remove Tolino's CDP watermark file
+                path_to_ebook = watermark.removeCDPwatermark(self, path_to_ebook) or path_to_ebook
+
+                # Remove watermarks (currently just Amazon) from the OPF file
+                path_to_ebook = watermark.removeOPFwatermarks(self, path_to_ebook) or path_to_ebook
+                    
+                # Remove watermarks (currently just Adobe's resource ID) from all HTML and XHTML files
+                path_to_ebook = watermark.removeHTMLwatermarks(self, path_to_ebook) or path_to_ebook
            
                return path_to_ebook

-    def removeCDPwatermarkFromEPUB(self, path_to_ebook):
-        # "META-INF/cdp.info" is a watermark file used by some Tolino vendors. 
-        # We don't want that in our eBooks, so lets remove that file.
-        try: 
-            infile = ZipFile(open(path_to_ebook, 'rb'))
-            namelist = infile.namelist()
-            if 'META-INF/cdp.info' not in namelist:
-                return path_to_ebook
-
-            namelist.remove("mimetype")
-            namelist.remove("META-INF/cdp.info")
-
-            output = self.temporary_file(".epub").name
-
-            kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
-            with closing(ZipFile(open(output, 'wb'), 'w', **kwds)) as outf:
-                for path in (["mimetype"] + namelist):
-
-                    data = infile.read(path)
-                    
-                    zi = ZipInfo(path)
-                    oldzi = infile.getinfo(path)
-                    try: 
-                        zi.compress_type = oldzi.compress_type
-                        if path == "mimetype":
-                            zi.compress_type = ZIP_STORED
-                        zi.date_time = oldzi.date_time
-                        zi.comment = oldzi.comment
-                        zi.extra = oldzi.extra
-                        zi.internal_attr = oldzi.internal_attr
-                        zi.external_attr = oldzi.external_attr
-                        zi.create_system = oldzi.create_system
-                        if any(ord(c) >= 128 for c in path) or any(ord(c) >= 128 for c in zi.comment):
-                            # If the file name or the comment contains any non-ASCII char, set the UTF8-flag
-                            zi.flag_bits |= 0x800
-                    except:
-                        pass
-
-                    outf.writestr(zi, data)
-            
-            print("{0} v{1}: Successfully removed cdp.info watermark".format(PLUGIN_NAME, PLUGIN_VERSION))
-            return output
-
        except: 
+            print("Error while checking settings")
            return path_to_ebook

    def checkFonts(self, path_to_ebook):
@ -267,10 +240,6 @@ class DeDRM(FileTypePlugin):
        # It checks if there's fonts that need to be deobfuscated

        try: 
-            import calibre_plugins.dedrm.prefs as prefs
-            dedrmprefs = prefs.DeDRM_Prefs()
-
-            if dedrmprefs["deobfuscate_fonts"] is True:
                import calibre_plugins.dedrm.epubfontdecrypt as epubfontdecrypt

                output = self.temporary_file(".epub").name
@ -283,10 +252,10 @@ class DeDRM(FileTypePlugin):
                else:
                    print("{0} v{1}: Error during font deobfuscation".format(PLUGIN_NAME, PLUGIN_VERSION))
                    raise DeDRMError("Font deobfuscation failed")
-            else: 
-                return path_to_ebook
+ 
        except: 
            print("{0} v{1}: Error during font deobfuscation".format(PLUGIN_NAME, PLUGIN_VERSION))
+            traceback.print_exc()
            return path_to_ebook

    def ePubDecrypt(self,path_to_ebook):
--- a/DeDRM_plugin/config.py
+++ b/DeDRM_plugin/config.py
@ -83,6 +83,7 @@ class ConfigWidget(QWidget):
        self.tempdedrmprefs['adobewineprefix'] = self.dedrmprefs['adobewineprefix']
        self.tempdedrmprefs['kindlewineprefix'] = self.dedrmprefs['kindlewineprefix']
        self.tempdedrmprefs['deobfuscate_fonts'] = self.dedrmprefs['deobfuscate_fonts']
+        self.tempdedrmprefs['remove_watermarks'] = self.dedrmprefs['remove_watermarks']

        # Start Qt Gui dialog layout
        layout = QVBoxLayout(self)
@ -146,6 +147,11 @@ class ConfigWidget(QWidget):
        self.chkFontObfuscation.setChecked(self.tempdedrmprefs["deobfuscate_fonts"])
        button_layout.addWidget(self.chkFontObfuscation)

+        self.chkRemoveWatermarks = QtGui.QCheckBox(_("Remove watermarks"))
+        self.chkRemoveWatermarks.setToolTip("Tries to remove watermarks from files")
+        self.chkRemoveWatermarks.setChecked(self.tempdedrmprefs["remove_watermarks"])
+        button_layout.addWidget(self.chkRemoveWatermarks)
+
        self.resize(self.sizeHint())

    def kindle_serials(self):
@ -209,6 +215,7 @@ class ConfigWidget(QWidget):
        self.dedrmprefs.set('kindlewineprefix', self.tempdedrmprefs['kindlewineprefix'])
        self.dedrmprefs.set('configured', True)
        self.dedrmprefs.set('deobfuscate_fonts', self.chkFontObfuscation.isChecked())
+        self.dedrmprefs.set('remove_watermarks', self.chkRemoveWatermarks.isChecked())
        self.dedrmprefs.writeprefs()

    def load_resource(self, name):
--- a/DeDRM_plugin/epubwatermark.py
+++ b/DeDRM_plugin/epubwatermark.py
@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# epubwatermark.py
+# Copyright © 2021 NoDRM
+
+# Revision history:
+#  1.0   - Initial version
+
+# Released under the terms of the GNU General Public Licence, version 3
+# <http://www.gnu.org/licenses/>
+
+"""
+Removes various watermarks from EPUB files
+"""
+
+import traceback
+from zipfile import ZipInfo, ZipFile, ZIP_STORED, ZIP_DEFLATED
+from contextlib import closing
+from lxml import etree
+import re
+
+# Runs a RegEx over all HTML/XHTML files to remove watermarks.
+def removeHTMLwatermarks(object, path_to_ebook):
+    try: 
+        inf = ZipFile(open(path_to_ebook, 'rb'))
+        namelist = inf.namelist()
+
+        modded_names = []
+        modded_contents = []
+
+        for file in namelist:
+            if not (file.endswith('.html') or file.endswith('.xhtml')):
+                continue
+
+            try:
+                file_str = inf.read(file).decode("utf-8")
+                str_new = file_str
+
+                # Remove Adobe ADEPT watermarks
+                # Match optional newline at the beginning, then a "meta" tag with name = "Adept.expected.resource" or "Adept.resource"
+                # and either a "value" or a "content" attribute with an Adobe UUID
+                str_new = re.sub(r'((\r\n|\r|\n)\s*)?\<meta\s+name=\"(Adept\.resource|Adept\.expected\.resource)\"\s+(content|value)=\"urn:uuid:[0-9a-fA-F\-]+\"\s*\/>', '', str_new)
+                str_new = re.sub(r'((\r\n|\r|\n)\s*)?\<meta\s+(content|value)=\"urn:uuid:[0-9a-fA-F\-]+\"\s+name=\"(Adept\.resource|Adept\.expected\.resource)\"\s*\/>', '', str_new)
+            except:
+                traceback.print_exc()
+                continue
+
+            if (file_str == str_new):
+                continue
+
+            modded_names.append(file)
+            modded_contents.append(str_new)
+        
+        if len(modded_names) == 0:
+            # No file modified, return original
+            return path_to_ebook
+
+        if len(modded_names) != len(modded_contents):
+            # Something went terribly wrong, return original
+            print("Watermark: Error during ADEPT watermark removal")
+            return path_to_ebook
+
+        # Re-package with modified files:
+        namelist.remove("mimetype")
+
+        try: 
+            output = object.temporary_file(".epub").name
+            kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
+            with closing(ZipFile(open(output, 'wb'), 'w', **kwds)) as outf:
+                for path in (["mimetype"] + namelist):
+
+                    data = inf.read(path)
+                    
+                    try: 
+                        modded_index = None
+                        modded_index = modded_names.index(path)
+                    except:
+                        pass
+
+                    if modded_index is not None:
+                        # Found modified file - replace contents
+                        data = modded_contents[modded_index]
+
+                    zi = ZipInfo(path)
+                    oldzi = inf.getinfo(path)
+                    try: 
+                        zi.compress_type = oldzi.compress_type
+                        if path == "mimetype":
+                            zi.compress_type = ZIP_STORED
+                        zi.date_time = oldzi.date_time
+                        zi.comment = oldzi.comment
+                        zi.extra = oldzi.extra
+                        zi.internal_attr = oldzi.internal_attr
+                        zi.external_attr = oldzi.external_attr
+                        zi.create_system = oldzi.create_system
+                        if any(ord(c) >= 128 for c in path) or any(ord(c) >= 128 for c in zi.comment):
+                            # If the file name or the comment contains any non-ASCII char, set the UTF8-flag
+                            zi.flag_bits |= 0x800
+                    except:
+                        pass
+
+                    outf.writestr(zi, data)
+        except:
+            traceback.print_exc()
+            return path_to_ebook
+
+    except:
+        traceback.print_exc()
+        return path_to_ebook
+        
+    print("Watermark: Successfully stripped {0} ADEPT watermark(s) from ebook.".format(len(modded_names)))
+    return output
+
+
+# Finds the main OPF file, then uses RegEx to remove watermarks
+def removeOPFwatermarks(object, path_to_ebook):
+    contNS = lambda tag: '{%s}%s' % ('urn:oasis:names:tc:opendocument:xmlns:container', tag)
+    opf_path = None
+
+    try:
+        inf = ZipFile(open(path_to_ebook, 'rb'))
+        container = etree.fromstring(inf.read("META-INF/container.xml"))
+        rootfiles = container.find(contNS("rootfiles")).findall(contNS("rootfile"))
+        for rootfile in rootfiles: 
+            opf_path = rootfile.get("full-path", None)
+            if (opf_path is not None):
+                break
+    except: 
+        traceback.print_exc()
+        return path_to_ebook
+
+    # If opf_path is None, we didn't find an OPF, so there is nothing to strip a watermark from.
+    # If opf_path is set, it's the path to the main content OPF file.
+
+    if (opf_path is None):
+        # No OPF found - no watermark
+        return path_to_ebook
+    else:
+        try:
+            container_str = inf.read(opf_path).decode("utf-8")
+            container_str_new = container_str
+
+            # Remove Amazon hex watermarks
+            # Match optional newline at the beginning, then spaces, then a "meta" tag with name = "Watermark" or "Watermark_(hex)" and a "content" attribute.
+            container_str_new = re.sub(r'((\r\n|\r|\n)\s*)?\<meta\s+name=\"Watermark(_\(hex\))?\"\s+content=\"[0-9a-fA-F]+\"\s*\/>', '', container_str_new)
+            container_str_new = re.sub(r'((\r\n|\r|\n)\s*)?\<meta\s+content=\"[0-9a-fA-F]+\"\s+name=\"Watermark(_\(hex\))?\"\s*\/>', '', container_str_new)
+        except:
+            traceback.print_exc()
+            return path_to_ebook
+
+        if (container_str == container_str_new):
+            # container didn't change - no watermark
+            return path_to_ebook
+
+        # Re-package without watermark
+        namelist = inf.namelist()
+        namelist.remove("mimetype")
+
+        try: 
+            output = object.temporary_file(".epub").name
+            kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
+            with closing(ZipFile(open(output, 'wb'), 'w', **kwds)) as outf:
+                for path in (["mimetype"] + namelist):
+
+                    data = inf.read(path)
+                    if path == opf_path:
+                        # Found OPF, replacing ...
+                        data = container_str_new
+
+                    zi = ZipInfo(path)
+                    oldzi = inf.getinfo(path)
+                    try: 
+                        zi.compress_type = oldzi.compress_type
+                        if path == "mimetype":
+                            zi.compress_type = ZIP_STORED
+                        zi.date_time = oldzi.date_time
+                        zi.comment = oldzi.comment
+                        zi.extra = oldzi.extra
+                        zi.internal_attr = oldzi.internal_attr
+                        zi.external_attr = oldzi.external_attr
+                        zi.create_system = oldzi.create_system
+                        if any(ord(c) >= 128 for c in path) or any(ord(c) >= 128 for c in zi.comment):
+                            # If the file name or the comment contains any non-ASCII char, set the UTF8-flag
+                            zi.flag_bits |= 0x800
+                    except:
+                        pass
+
+                    outf.writestr(zi, data)
+        except:
+            traceback.print_exc()
+            return path_to_ebook
+        
+        print("Watermark: Successfully stripped Amazon watermark from OPF file.")
+        return output
+
+
+
+def removeCDPwatermark(object, path_to_ebook):
+    # "META-INF/cdp.info" is a watermark file used by some Tolino vendors. 
+    # We don't want that in our eBooks, so let's remove that file.
+    try: 
+        infile = ZipFile(open(path_to_ebook, 'rb'))
+        namelist = infile.namelist()
+        if 'META-INF/cdp.info' not in namelist:
+            return path_to_ebook
+
+        namelist.remove("mimetype")
+        namelist.remove("META-INF/cdp.info")
+
+        output = object.temporary_file(".epub").name
+
+        kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
+        with closing(ZipFile(open(output, 'wb'), 'w', **kwds)) as outf:
+            for path in (["mimetype"] + namelist):
+
+                data = infile.read(path)
+                
+                zi = ZipInfo(path)
+                oldzi = infile.getinfo(path)
+                try: 
+                    zi.compress_type = oldzi.compress_type
+                    if path == "mimetype":
+                        zi.compress_type = ZIP_STORED
+                    zi.date_time = oldzi.date_time
+                    zi.comment = oldzi.comment
+                    zi.extra = oldzi.extra
+                    zi.internal_attr = oldzi.internal_attr
+                    zi.external_attr = oldzi.external_attr
+                    zi.create_system = oldzi.create_system
+                    if any(ord(c) >= 128 for c in path) or any(ord(c) >= 128 for c in zi.comment):
+                        # If the file name or the comment contains any non-ASCII char, set the UTF8-flag
+                        zi.flag_bits |= 0x800
+                except:
+                    pass
+
+                outf.writestr(zi, data)
+        
+        print("Watermark: Successfully removed cdp.info watermark")
+        return output
+
+    except: 
+        traceback.print_exc()
+        return path_to_ebook
--- a/DeDRM_plugin/prefs.py
+++ b/DeDRM_plugin/prefs.py
@ -20,6 +20,7 @@ class DeDRM_Prefs():

        self.dedrmprefs.defaults['configured'] = False
        self.dedrmprefs.defaults['deobfuscate_fonts'] = True
+        self.dedrmprefs.defaults['remove_watermarks'] = False
        self.dedrmprefs.defaults['bandnkeys'] = {}
        self.dedrmprefs.defaults['adeptkeys'] = {}
        self.dedrmprefs.defaults['ereaderkeys'] = {}