+#!/usr/bin/env python3
+#
+# pdfstrip.py - strip objects from PDF files by specifying objects IDs
+#
+# Copyright (C) 2016 Antonio Ospite <ao2@ao2.it>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import argparse
+import logging
+import sys
+
+from pdfrw import PdfReader, PdfWriter
+from pdfrw.objects.pdfindirect import PdfIndirect
+
+
# pylint: disable=invalid-name
# Module-level logger with its own console handler; records are not forwarded
# to ancestor loggers, so they are emitted only through this handler.
logger = logging.getLogger(__name__)
logger.propagate = False

# Each record is prefixed with its level, the emitting function and the line.
formatter = logging.Formatter('[%(levelname)s] %(funcName)s:%(lineno)d %(message)s')

console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
+
+
def resolve_objects_names(pdf, objects_ids):
    """Map object IDs to the ``Name`` of the objects they refer to.

    Each ID is looked up with ``pdf.findindirect(objnum, 0)`` (the second
    argument is presumably the generation number -- confirm against pdfrw).
    IDs that cannot be resolved to a non-empty name are reported with a
    warning and skipped.

    Args:
        pdf: the document object providing ``findindirect``.
        objects_ids: iterable of integer object IDs.

    Returns:
        The list of resolved names, in the order of the IDs that produced them.
    """
    logger.debug("objects_ids %s", objects_ids)

    names = []
    for object_id in objects_ids:
        candidate = pdf.findindirect(object_id, 0)

        # Only indirect references can be dereferenced to a real object.
        if not isinstance(candidate, PdfIndirect):
            logger.warning("Object %d is not a PdfIndirect but a %s",
                           object_id, type(candidate))
            continue

        try:
            name = candidate.real_value().Name
        except AttributeError:
            logger.warning("Object %d has no 'Name' attribute", object_id)
            continue

        if name:
            names.append(name)
        else:
            logger.warning("Object %d has an empty 'Name' attribute", object_id)

    logger.debug("objects_names %s\n", names)
    return names
+
+
def strip_objects(pdf, objects_names):
    """Remove the named entries from every page's ``Resources.XObject``.

    Pages whose ``Resources`` or ``Resources.XObject`` is ``None`` are
    skipped instead of raising ``AttributeError``: such pages have nothing
    that could be stripped here.

    NOTE(review): resources inherited from a parent node are presumably not
    visible through ``page.Resources`` and are therefore left untouched --
    confirm against pdfrw if that case matters.

    Args:
        pdf: the document object; its ``pages`` attribute is iterated.
        objects_names: names (keys of the ``XObject`` dictionary) to delete.

    Returns:
        The same ``pdf`` object, modified in place.
    """
    for page_number, page in enumerate(pdf.pages, start=1):
        logger.debug("Page %d", page_number)

        resources = page.Resources
        xobjects = resources.XObject if resources is not None else None
        if xobjects is None:
            # An empty dictionary is still processed below; only a missing
            # one is skipped.
            logger.debug("No XObject resources on this page\n")
            continue

        logger.debug("Before %s", xobjects.keys())
        for name in objects_names:
            if name in xobjects:
                del xobjects[name]

        logger.debug("After %s\n", xobjects.keys())

    return pdf
+
+
def validate_objects_ids(objects_ids_string):
    """Parse a comma-separated list of object IDs (argparse ``type=`` helper).

    Args:
        objects_ids_string: the raw command-line value, e.g. ``"12,15,40"``.

    Returns:
        The IDs as a list of ``int``, in the order given.

    Raises:
        argparse.ArgumentTypeError: if any element is not a valid integer
            (this includes empty elements such as in ``"1,,2"``).
    """
    try:
        return [int(obj) for obj in objects_ids_string.split(',')]
    except ValueError as err:
        # Only int() can fail here, and it fails with ValueError; chain the
        # original error so the offending element stays visible in tracebacks.
        raise argparse.ArgumentTypeError(
            "%s contains an invalid value" % objects_ids_string) from err
+
+
def main():
    """Command-line entry point.

    Parses the arguments, reads the input PDF, resolves the given object IDs
    to names, strips those names from the pages and writes the output PDF.

    Returns:
        0 after the output file has been written.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("input_filename",
                     help="the input PDF file (better if uncompressed)")
    cli.add_argument("output_filename",
                     help="the output PDF file")
    cli.add_argument("objects_ids",
                     type=validate_objects_ids,
                     help="a comma-separated list of objects IDs")
    cli.add_argument("-d", "--debug", action="store_true",
                     help="enable debug output")
    options = cli.parse_args()

    if options.debug:
        logger.setLevel(logging.DEBUG)

    document = PdfReader(options.input_filename)
    names = resolve_objects_names(document, options.objects_ids)
    stripped = strip_objects(document, names)

    PdfWriter().write(options.output_filename, stripped)

    return 0
+
+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    sys.exit(exit_status)