Add some more watermark removal code

2021-11-17 16:17:30 +01:00 · 2021-11-17 16:17:30 +01:00 · 1b391da815
parent 1545d76803
commit 1b391da815
4 changed files with 275 additions and 54 deletions
--- a/DeDRM_plugin/init.py
+++ b/DeDRM_plugin/init.py
@ -90,12 +90,9 @@ PLUGIN_VERSION = ".".join([str(x)for x in PLUGIN_VERSION_TUPLE])
 RESOURCE_NAME = PLUGIN_NAME + '_Help.htm'
 import codecs
-import sys, os, re
+import sys, os
 import time
 import zipfile
 import traceback
 from zipfile import ZipInfo, ZipFile, ZIP_STORED, ZIP_DEFLATED
 from contextlib import closing
 class DeDRMError(Exception):
@ -211,55 +208,31 @@ class DeDRM(FileTypePlugin):
        # This is called after the DRM is removed (or if no DRM was present)
        # It does stuff like de-obfuscating fonts (by calling checkFonts) 
        # or removing watermarks. 
        path_to_ebook = self.checkFonts(path_to_ebook)
        path_to_ebook = self.removeCDPwatermarkFromEPUB(path_to_ebook)
        return path_to_ebook
    def removeCDPwatermarkFromEPUB(self, path_to_ebook):
        # "META-INF/cdp.info" is a watermark file used by some Tolino vendors. 
        # We don't want that in our eBooks, so lets remove that file.
        try: 
-            infile = ZipFile(open(path_to_ebook, 'rb'))
+            import calibre_plugins.dedrm.prefs as prefs
-            namelist = infile.namelist()
+            dedrmprefs = prefs.DeDRM_Prefs()
-            if 'META-INF/cdp.info' not in namelist:
+
            if dedrmprefs["deobfuscate_fonts"] is True:
                # Deobfuscate fonts
                path_to_ebook = self.checkFonts(path_to_ebook) or path_to_ebook
            if dedrmprefs["remove_watermarks"] is True:
                import calibre_plugins.dedrm.epubwatermark as watermark
                # Remove Tolino's CDP watermark file
                path_to_ebook = watermark.removeCDPwatermark(self, path_to_ebook) or path_to_ebook
                # Remove watermarks (currently just Amazon) from the OPF file
                path_to_ebook = watermark.removeOPFwatermarks(self, path_to_ebook) or path_to_ebook
                # Remove watermarks (currently just Adobe's resource ID) from all HTML and XHTML files
                path_to_ebook = watermark.removeHTMLwatermarks(self, path_to_ebook) or path_to_ebook
                return path_to_ebook
            namelist.remove("mimetype")
            namelist.remove("META-INF/cdp.info")
            output = self.temporary_file(".epub").name
            kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
            with closing(ZipFile(open(output, 'wb'), 'w', **kwds)) as outf:
                for path in (["mimetype"] + namelist):
                    data = infile.read(path)
                    zi = ZipInfo(path)
                    oldzi = infile.getinfo(path)
                    try: 
                        zi.compress_type = oldzi.compress_type
                        if path == "mimetype":
                            zi.compress_type = ZIP_STORED
                        zi.date_time = oldzi.date_time
                        zi.comment = oldzi.comment
                        zi.extra = oldzi.extra
                        zi.internal_attr = oldzi.internal_attr
                        zi.external_attr = oldzi.external_attr
                        zi.create_system = oldzi.create_system
                        if any(ord(c) >= 128 for c in path) or any(ord(c) >= 128 for c in zi.comment):
                            # If the file name or the comment contains any non-ASCII char, set the UTF8-flag
                            zi.flag_bits |= 0x800
                    except:
                        pass
                    outf.writestr(zi, data)
            print("{0} v{1}: Successfully removed cdp.info watermark".format(PLUGIN_NAME, PLUGIN_VERSION))
            return output
        except: 
            print("Error while checking settings")
            return path_to_ebook
    def checkFonts(self, path_to_ebook):
@ -267,10 +240,6 @@ class DeDRM(FileTypePlugin):
        # It checks if there's fonts that need to be deobfuscated
        try: 
            import calibre_plugins.dedrm.prefs as prefs
            dedrmprefs = prefs.DeDRM_Prefs()
            if dedrmprefs["deobfuscate_fonts"] is True:
                import calibre_plugins.dedrm.epubfontdecrypt as epubfontdecrypt
                output = self.temporary_file(".epub").name
@ -283,10 +252,10 @@ class DeDRM(FileTypePlugin):
                else:
                    print("{0} v{1}: Error during font deobfuscation".format(PLUGIN_NAME, PLUGIN_VERSION))
                    raise DeDRMError("Font deobfuscation failed")
-            else: 
+ 
                return path_to_ebook
        except: 
            print("{0} v{1}: Error during font deobfuscation".format(PLUGIN_NAME, PLUGIN_VERSION))
            traceback.print_exc()
            return path_to_ebook
    def ePubDecrypt(self,path_to_ebook):
--- a/DeDRM_plugin/config.py
+++ b/DeDRM_plugin/config.py
@ -83,6 +83,7 @@ class ConfigWidget(QWidget):
        self.tempdedrmprefs['adobewineprefix'] = self.dedrmprefs['adobewineprefix']
        self.tempdedrmprefs['kindlewineprefix'] = self.dedrmprefs['kindlewineprefix']
        self.tempdedrmprefs['deobfuscate_fonts'] = self.dedrmprefs['deobfuscate_fonts']
        self.tempdedrmprefs['remove_watermarks'] = self.dedrmprefs['remove_watermarks']
        # Start Qt Gui dialog layout
        layout = QVBoxLayout(self)
@ -146,6 +147,11 @@ class ConfigWidget(QWidget):
        self.chkFontObfuscation.setChecked(self.tempdedrmprefs["deobfuscate_fonts"])
        button_layout.addWidget(self.chkFontObfuscation)
        self.chkRemoveWatermarks = QtGui.QCheckBox(_("Remove watermarks"))
        self.chkRemoveWatermarks.setToolTip("Tries to remove watermarks from files")
        self.chkRemoveWatermarks.setChecked(self.tempdedrmprefs["remove_watermarks"])
        button_layout.addWidget(self.chkRemoveWatermarks)
        self.resize(self.sizeHint())
    def kindle_serials(self):
@ -209,6 +215,7 @@ class ConfigWidget(QWidget):
        self.dedrmprefs.set('kindlewineprefix', self.tempdedrmprefs['kindlewineprefix'])
        self.dedrmprefs.set('configured', True)
        self.dedrmprefs.set('deobfuscate_fonts', self.chkFontObfuscation.isChecked())
        self.dedrmprefs.set('remove_watermarks', self.chkRemoveWatermarks.isChecked())
        self.dedrmprefs.writeprefs()
    def load_resource(self, name):
--- a/DeDRM_plugin/epubwatermark.py
+++ b/DeDRM_plugin/epubwatermark.py
@ -0,0 +1,244 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 # epubwatermark.py
 # Copyright © 2021 NoDRM
 # Revision history:
 #  1.0   - Initial version
 # Released under the terms of the GNU General Public Licence, version 3
 # <http://www.gnu.org/licenses/>
 """
 Removes various watermarks from EPUB files
 """
 import traceback
 from zipfile import ZipInfo, ZipFile, ZIP_STORED, ZIP_DEFLATED
 from contextlib import closing
 from lxml import etree
 import re
 # Runs a RegEx over all HTML/XHTML files to remove watermarks.
 def removeHTMLwatermarks(object, path_to_ebook):
    """Strip Adobe ADEPT resource-ID <meta> tags from the HTML/XHTML files of an EPUB.

    Every archive member whose name ends in ".html" or ".xhtml" is decoded as
    UTF-8 and run through two regular expressions (one per attribute order).
    If at least one file changed, the archive is re-packaged into a new
    temporary file with "mimetype" written first and uncompressed.

    object        -- plugin instance; only object.temporary_file(".epub").name is used.
    path_to_ebook -- path of the EPUB to inspect.

    Returns the path of the cleaned copy, or path_to_ebook unchanged when
    nothing was modified or any error occurred (the error is printed).
    """
    # Optional newline + indentation in front of the tag, then a self-closing
    # "meta" tag with name = "Adept.resource" / "Adept.expected.resource" and a
    # "content" or "value" attribute holding an Adobe UUID (either attribute order).
    adept_patterns = (
        r'((\r\n|\r|\n)\s*)?\<meta\s+name=\"(Adept\.resource|Adept\.expected\.resource)\"\s+(content|value)=\"urn:uuid:[0-9a-fA-F\-]+\"\s*\/>',
        r'((\r\n|\r|\n)\s*)?\<meta\s+(content|value)=\"urn:uuid:[0-9a-fA-F\-]+\"\s+name=\"(Adept\.resource|Adept\.expected\.resource)\"\s*\/>',
    )
    try:
        # Both the raw file handle and the ZipFile are closed on every exit path.
        with open(path_to_ebook, 'rb') as src_fh, closing(ZipFile(src_fh)) as inf:
            namelist = inf.namelist()
            # archive member name -> cleaned text, only for files that changed
            modded = {}
            for name in namelist:
                if not name.endswith(('.html', '.xhtml')):
                    continue
                try:
                    file_str = inf.read(name).decode("utf-8")
                    str_new = file_str
                    for pattern in adept_patterns:
                        str_new = re.sub(pattern, '', str_new)
                except Exception:
                    # A single unreadable / non-UTF-8 file must not abort the whole book.
                    traceback.print_exc()
                    continue
                if file_str != str_new:
                    modded[name] = str_new
            if not modded:
                # No file modified, return original
                return path_to_ebook
            # Re-package with modified files; "mimetype" has to be the first entry.
            namelist.remove("mimetype")
            output = object.temporary_file(".epub").name
            kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
            with open(output, 'wb') as dst_fh, closing(ZipFile(dst_fh, 'w', **kwds)) as outf:
                for path in (["mimetype"] + namelist):
                    if path in modded:
                        # Found modified file - replace contents
                        data = modded[path].encode("utf-8")
                    else:
                        data = inf.read(path)
                    zi = ZipInfo(path)
                    oldzi = inf.getinfo(path)
                    try:
                        # Carry the original entry's metadata over (best effort).
                        zi.compress_type = oldzi.compress_type
                        if path == "mimetype":
                            zi.compress_type = ZIP_STORED
                        zi.date_time = oldzi.date_time
                        zi.comment = oldzi.comment
                        zi.extra = oldzi.extra
                        zi.internal_attr = oldzi.internal_attr
                        zi.external_attr = oldzi.external_attr
                        zi.create_system = oldzi.create_system
                        # bytearray() yields ints on Python 2 and 3 alike; calling
                        # ord() on the items of a bytes comment fails on Python 3.
                        if any(ord(c) >= 128 for c in path) or any(b >= 128 for b in bytearray(zi.comment)):
                            # If the file name or the comment contains any non-ASCII char, set the UTF8-flag
                            zi.flag_bits |= 0x800
                    except Exception:
                        # Metadata is cosmetic: report the problem, still write the entry.
                        traceback.print_exc()
                    outf.writestr(zi, data)
    except Exception:
        traceback.print_exc()
        return path_to_ebook
    print("Watermark: Successfully stripped {0} ADEPT watermark(s) from ebook.".format(len(modded)))
    return output
 # Finds the main OPF file, then uses RegEx to remove watermarks
 def removeOPFwatermarks(object, path_to_ebook):
    """Strip Amazon "Watermark" / "Watermark_(hex)" <meta> tags from an EPUB's OPF file.

    The OPF is located through the first <rootfile> element in
    META-INF/container.xml that carries a "full-path" attribute. Its text is
    decoded as UTF-8 and run through two regular expressions (one per
    attribute order). If the text changed, the archive is re-packaged into a
    new temporary file with "mimetype" written first and uncompressed.

    object        -- plugin instance; only object.temporary_file(".epub").name is used.
    path_to_ebook -- path of the EPUB to inspect.

    Returns the path of the cleaned copy, or path_to_ebook unchanged when no
    OPF / no watermark was found or any error occurred (the error is printed).
    """
    contNS = lambda tag: '{%s}%s' % ('urn:oasis:names:tc:opendocument:xmlns:container', tag)
    # Optional newline + indentation in front of the tag, then a self-closing
    # "meta" tag with name = "Watermark" or "Watermark_(hex)" and a hex-only
    # "content" attribute (either attribute order).
    amazon_patterns = (
        r'((\r\n|\r|\n)\s*)?\<meta\s+name=\"Watermark(_\(hex\))?\"\s+content=\"[0-9a-fA-F]+\"\s*\/>',
        r'((\r\n|\r|\n)\s*)?\<meta\s+content=\"[0-9a-fA-F]+\"\s+name=\"Watermark(_\(hex\))?\"\s*\/>',
    )
    try:
        # Both the raw file handle and the ZipFile are closed on every exit path.
        with open(path_to_ebook, 'rb') as src_fh, closing(ZipFile(src_fh)) as inf:
            container = etree.fromstring(inf.read("META-INF/container.xml"))
            rootfiles = container.find(contNS("rootfiles")).findall(contNS("rootfile"))
            opf_path = None
            for rootfile in rootfiles:
                opf_path = rootfile.get("full-path", None)
                if opf_path is not None:
                    break
            if opf_path is None:
                # No OPF found - no watermark
                return path_to_ebook
            opf_str = inf.read(opf_path).decode("utf-8")
            opf_str_new = opf_str
            for pattern in amazon_patterns:
                opf_str_new = re.sub(pattern, '', opf_str_new)
            if opf_str == opf_str_new:
                # OPF didn't change - no watermark
                return path_to_ebook
            # Re-package without watermark; "mimetype" has to be the first entry.
            namelist = inf.namelist()
            namelist.remove("mimetype")
            output = object.temporary_file(".epub").name
            kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
            with open(output, 'wb') as dst_fh, closing(ZipFile(dst_fh, 'w', **kwds)) as outf:
                for path in (["mimetype"] + namelist):
                    if path == opf_path:
                        # Found OPF, replacing ...
                        data = opf_str_new.encode("utf-8")
                    else:
                        data = inf.read(path)
                    zi = ZipInfo(path)
                    oldzi = inf.getinfo(path)
                    try:
                        # Carry the original entry's metadata over (best effort).
                        zi.compress_type = oldzi.compress_type
                        if path == "mimetype":
                            zi.compress_type = ZIP_STORED
                        zi.date_time = oldzi.date_time
                        zi.comment = oldzi.comment
                        zi.extra = oldzi.extra
                        zi.internal_attr = oldzi.internal_attr
                        zi.external_attr = oldzi.external_attr
                        zi.create_system = oldzi.create_system
                        # bytearray() yields ints on Python 2 and 3 alike; calling
                        # ord() on the items of a bytes comment fails on Python 3.
                        if any(ord(c) >= 128 for c in path) or any(b >= 128 for b in bytearray(zi.comment)):
                            # If the file name or the comment contains any non-ASCII char, set the UTF8-flag
                            zi.flag_bits |= 0x800
                    except Exception:
                        # Metadata is cosmetic: report the problem, still write the entry.
                        traceback.print_exc()
                    outf.writestr(zi, data)
    except Exception:
        traceback.print_exc()
        return path_to_ebook
    print("Watermark: Successfully stripped Amazon watermark from OPF file.")
    return output
 def removeCDPwatermark(object, path_to_ebook):
    """Remove the "META-INF/cdp.info" watermark file from an EPUB.

    "META-INF/cdp.info" is a watermark file used by some Tolino vendors.
    When it is present, the archive is re-packaged without it into a new
    temporary file, with "mimetype" written first and uncompressed.

    object        -- plugin instance; only object.temporary_file(".epub").name is used.
    path_to_ebook -- path of the EPUB to inspect.

    Returns the path of the cleaned copy, or path_to_ebook unchanged when the
    file is not present or any error occurred (the error is printed).
    """
    try:
        # Both the raw file handle and the ZipFile are closed on every exit path.
        with open(path_to_ebook, 'rb') as src_fh, closing(ZipFile(src_fh)) as infile:
            namelist = infile.namelist()
            if 'META-INF/cdp.info' not in namelist:
                return path_to_ebook
            # "mimetype" is re-added below as the first entry; cdp.info is dropped.
            namelist.remove("mimetype")
            namelist.remove("META-INF/cdp.info")
            output = object.temporary_file(".epub").name
            kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
            with open(output, 'wb') as dst_fh, closing(ZipFile(dst_fh, 'w', **kwds)) as outf:
                for path in (["mimetype"] + namelist):
                    data = infile.read(path)
                    zi = ZipInfo(path)
                    oldzi = infile.getinfo(path)
                    try:
                        # Carry the original entry's metadata over (best effort).
                        zi.compress_type = oldzi.compress_type
                        if path == "mimetype":
                            zi.compress_type = ZIP_STORED
                        zi.date_time = oldzi.date_time
                        zi.comment = oldzi.comment
                        zi.extra = oldzi.extra
                        zi.internal_attr = oldzi.internal_attr
                        zi.external_attr = oldzi.external_attr
                        zi.create_system = oldzi.create_system
                        # bytearray() yields ints on Python 2 and 3 alike; calling
                        # ord() on the items of a bytes comment fails on Python 3.
                        if any(ord(c) >= 128 for c in path) or any(b >= 128 for b in bytearray(zi.comment)):
                            # If the file name or the comment contains any non-ASCII char, set the UTF8-flag
                            zi.flag_bits |= 0x800
                    except Exception:
                        # Metadata is cosmetic: report the problem, still write the entry.
                        traceback.print_exc()
                    outf.writestr(zi, data)
        print("Watermark: Successfully removed cdp.info watermark")
        return output
    except Exception:
        traceback.print_exc()
        return path_to_ebook
--- a/DeDRM_plugin/prefs.py
+++ b/DeDRM_plugin/prefs.py
@ -20,6 +20,7 @@ class DeDRM_Prefs():
        self.dedrmprefs.defaults['configured'] = False
        self.dedrmprefs.defaults['deobfuscate_fonts'] = True
        self.dedrmprefs.defaults['remove_watermarks'] = False
        self.dedrmprefs.defaults['bandnkeys'] = {}
        self.dedrmprefs.defaults['adeptkeys'] = {}
        self.dedrmprefs.defaults['ereaderkeys'] = {}