#!/usr/bin/env python3
#
# pdfstrip.py - strip objects from PDF files by specifying objects IDs
#
# Copyright (C) 2016 Antonio Ospite
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import argparse
import logging
import sys

from pdfrw import PdfReader, PdfWriter
from pdfrw.objects.pdfindirect import PdfIndirect

# pylint: disable=invalid-name
logger = logging.getLogger(__name__)
console_handler = logging.StreamHandler()
formatter = logging.Formatter('[%(levelname)s] %(funcName)s:%(lineno)d %(message)s')
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
# Messages are emitted only through the handler configured above.
logger.propagate = False


def resolve_objects_names(pdf, objects_ids):
    """Map PDF object numbers to the 'Name' attribute of each object.

    Every ID is looked up with ``pdf.findindirect(objnum, 0)`` (generation
    number 0).  IDs that cannot be resolved to a non-empty name are reported
    with a warning and skipped.

    Args:
        pdf: the document to query; must provide ``findindirect``
            (``main`` passes a ``PdfReader``).
        objects_ids: iterable of integer object numbers.

    Returns:
        The list of names that were found, in the order of ``objects_ids``.
    """
    logger.debug("objects_ids %s", objects_ids)

    objects_names = []
    for objnum in objects_ids:
        indirect_object = pdf.findindirect(objnum, 0)
        if not isinstance(indirect_object, PdfIndirect):
            logger.warning("Object %d is not a PdfIndirect but a %s",
                           objnum, type(indirect_object))
            continue

        try:
            real_object = indirect_object.real_value()
            if real_object.Name:
                objects_names.append(real_object.Name)
            else:
                logger.warning("Object %d has an empty 'Name' attribute", objnum)
        except AttributeError:
            # Raised when the resolved value does not support attribute
            # access to 'Name' at all.
            logger.warning("Object %d has no 'Name' attribute", objnum)

    logger.debug("objects_names %s\n", objects_names)
    return objects_names


def strip_objects(pdf, objects_names):
    """Remove the named entries from the XObject resources of every page.

    Pages that have no ``Resources`` or no ``XObject`` entry are skipped
    instead of aborting the whole run with an ``AttributeError``.

    Args:
        pdf: the document to modify; must provide ``pages``.
        objects_names: names of the XObject entries to delete.

    Returns:
        The same ``pdf`` object, modified in place.
    """
    for i, page in enumerate(pdf.pages):
        logger.debug("Page %d", i + 1)

        # A missing entry is assumed to come back as None (pdfrw PdfDict
        # behaviour - confirm against the pdfrw version in use), so both
        # levels have to be checked before they are dereferenced.
        resources = page.Resources
        xobjects = resources.XObject if resources is not None else None
        if xobjects is None:
            logger.debug("No XObject resources, nothing to strip\n")
            continue

        logger.debug("Before %s", xobjects.keys())
        for obj in objects_names:
            if obj in xobjects:
                del xobjects[obj]
        logger.debug("After %s\n", xobjects.keys())

    return pdf


def validate_objects_ids(objects_ids_string):
    """Parse a comma-separated list of object IDs (argparse ``type=`` hook).

    Args:
        objects_ids_string: text such as ``"12,15,42"``.

    Returns:
        The IDs as a list of integers.

    Raises:
        argparse.ArgumentTypeError: if any element is not a valid integer
            (this includes empty elements, e.g. ``"1,,2"``).
    """
    try:
        objects_ids = [int(obj) for obj in objects_ids_string.split(',')]
    except ValueError as err:
        raise argparse.ArgumentTypeError(
            "%s contains an invalid value" % objects_ids_string) from err

    return objects_ids


def main():
    """Parse the command line, strip the requested objects, write the result.

    Returns:
        0, used as the process exit status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input_filename",
                        help="the input PDF file (better if uncompressed)")
    parser.add_argument("output_filename",
                        help="the output PDF file")
    parser.add_argument("objects_ids", type=validate_objects_ids,
                        help="a comma-separated list of objects IDs")
    parser.add_argument("-d", "--debug", action="store_true",
                        help="enable debug output")
    args = parser.parse_args()

    if args.debug:
        logger.setLevel(logging.DEBUG)

    pdf_data = PdfReader(args.input_filename)
    objects_names = resolve_objects_names(pdf_data, args.objects_ids)
    pdf_data = strip_objects(pdf_data, objects_names)
    PdfWriter().write(args.output_filename, pdf_data)

    return 0


if __name__ == '__main__':
    sys.exit(main())