1
2
3
4
5
6
7
8
|
>>> from tika import parser
>>> file_data = parser.from_file('340395.pdf')
>>> file_data.keys()
dict_keys(['metadata', 'content', 'status'])
>>> meta = file_data['metadata']
>>> meta
{'Content-Type': 'application/pdf', 'Creation-Date': '2020-07-11T09:07:21Z', 'X-Parsed-By': ['org.apache.tika.parser.DefaultParser', 'org.apache.tika.parser.pdf.PDFParser'], 'X-TIKA:content_handler': 'ToTextContentHandler', 'X-TIKA:embedded_depth': '0', 'X-TIKA:parse_time_millis': '1148', 'access_permission:assemble_document': 'true', 'access_permission:can_modify': 'true', 'access_permission:can_print': 'true', 'access_permission:can_print_degraded': 'true', 'access_permission:extract_content': 'true', 'access_permission:extract_for_accessibility': 'true', 'access_permission:fill_in_form': 'true', 'access_permission:modify_annotations': 'true', 'created': '2020-07-11T09:07:21Z', 'dc:format': 'application/pdf; version=1.4', 'dcterms:created': '2020-07-11T09:07:21Z', 'meta:creation-date': '2020-07-11T09:07:21Z', 'pdf:PDFVersion': '1.4', 'pdf:charsPerPage': ['1384', '1509', '1508', '1460', '1471', '1394', '1440', '1471', '1432', '1548', '1459', '1014'], 'pdf:docinfo:created': '2020-07-11T09:07:21Z', 'pdf:docinfo:creator_tool': 'DocuWorks PDF Driver 7.0.4', 'pdf:docinfo:producer': 'DocuWorks PDF Build 9', 'pdf:encrypted': 'false', 'pdf:hasMarkedContent': 'false', 'pdf:hasXFA': 'false', 'pdf:hasXMP': 'false', 'pdf:unmappedUnicodeCharsPerPage': ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], 'producer': 'DocuWorks PDF Build 9', 'resourceName': "b'340395.pdf'", 'xmp:CreatorTool': 'DocuWorks PDF Driver 7.0.4', 'xmpTPg:NPages': '12'}
|