Skip to content

Commit

Permalink
TST: Add tests for XMP information
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma committed Jun 15, 2022
1 parent e47e057 commit 0306bd0
Showing 1 changed file with 85 additions and 0 deletions.
85 changes: 85 additions & 0 deletions tests/test_xmp.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import os
from datetime import datetime
from io import BytesIO

import pytest

import PyPDF2.generic
import PyPDF2.xmp
from PyPDF2 import PdfReader

from . import get_pdf_from_url

# Absolute path of the directory that contains this test module.
TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
# Parent directory of the tests directory.
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
# "resources" directory located directly under PROJECT_ROOT.
RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "resources")
Expand Down Expand Up @@ -83,6 +86,88 @@ def test_identity(x):
assert PyPDF2.xmp._identity(x) == x


@pytest.mark.parametrize(
    ("url", "name", "xmpmm_instance_id"),
    [
        (
            "https://corpora.tika.apache.org/base/docs/govdocs1/955/955562.pdf",
            "tika-955562.pdf",
            "uuid:ca96e032-c2af-49bd-a71c-95889bafbf1d",
        )
    ],
)
def test_xmpmm(url, name, xmpmm_instance_id):
    """Check ``xmpmm_instanceId`` of a downloaded PDF on two consecutive reads.

    The PDF is obtained through ``get_pdf_from_url`` and parsed from an
    in-memory buffer; the attribute must equal ``xmpmm_instance_id`` both
    times it is read.
    """
    pdf_bytes = get_pdf_from_url(url, name=name)
    stream = BytesIO(pdf_bytes)
    reader = PdfReader(stream)
    metadata = reader.xmp_metadata

    # First read of the attribute.
    assert metadata.xmpmm_instanceId == xmpmm_instance_id
    # Second read: intended as a cache hit.
    assert metadata.xmpmm_instanceId == xmpmm_instance_id


def test_dc_description():
    """Check ``dc_description`` of a downloaded PDF on two consecutive reads."""
    expected = {"x-default": "U.S. Title 50 Certification Form"}

    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/953/953770.pdf",
        name="tika-953770.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    metadata = reader.xmp_metadata

    # First read of the attribute.
    assert metadata.dc_description == expected
    # Second read: intended as a cache hit.
    assert metadata.dc_description == expected


def test_dc_creator():
    """Check ``dc_creator`` of a downloaded PDF on two consecutive reads."""
    expected = ["U.S. Fish and Wildlife Service"]

    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/953/953770.pdf",
        name="tika-953770.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    metadata = reader.xmp_metadata

    # First read of the attribute.
    assert metadata.dc_creator == expected
    # Second read: intended as a cache hit.
    assert metadata.dc_creator == expected


def test_custom_properties():
    """Check ``custom_properties`` of a downloaded PDF on two consecutive reads."""
    expected = {"Style": "Searchable Image (Exact)"}

    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/986/986065.pdf",
        name="tika-986065.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    metadata = reader.xmp_metadata

    # First read of the attribute.
    assert metadata.custom_properties == expected
    # Second read: intended as a cache hit.
    assert metadata.custom_properties == expected


def test_dc_subject():
    """Check ``dc_subject`` of a downloaded PDF on two consecutive reads."""
    expected = [
        "P&P",
        "manual",
        "1240.2325",
        "CVM",
        "PROCEDURES ON MEDIA INQUIRIES",
        "animal",
        "media",
        "procedures",
        "inquiries",
    ]

    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/959/959519.pdf",
        name="tika-959519.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    metadata = reader.xmp_metadata

    # First read of the attribute.
    assert metadata.dc_subject == expected
    # Second read: intended as a cache hit.
    assert metadata.dc_subject == expected


# def test_getter_bag():
# f = PyPDF2.xmp._getter_bag("namespace", "name")
# class Tst: # to replace pdf
Expand Down

0 comments on commit 0306bd0

Please sign in to comment.