import sqlite3
import pandas as pd
import os
import re
import sys


# Make the experiment-local `lib` package importable.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
sys.path.append("/home/mike/devel/qurator-mono-repo/experiments/2022-02 digisam columns")
from lib import oai_show  # oai_mets is not imported; a local debugging copy is defined below


# Base directory of the data files; every path read below is joined onto it.
# NOTE(review): absolute, machine-specific path.
QURATOR_DATA = "/home/mike/devel/qurator-data"

# Fail early if the data directory is missing.
# NOTE(review): `assert` is skipped when Python runs with -O.
assert os.path.exists(QURATOR_DATA)


# Regex fragment for a PPN identifier: the literal "PPN" followed by one or
# more digits or "X" characters.
PPN_RE = r"PPN[\dX]+"


# Load the complete `result` table of the digisam column-detection database
# into a DataFrame. The connection is closed in a `finally` clause so that it
# is released even if the query fails.
con = sqlite3.connect(os.path.join(QURATOR_DATA, "experiments/2022-02-digisam-columns/digisam.sqlite3"))
try:
    columns_df = pd.read_sql_query("SELECT * from result", con)
finally:
    con.close()


# Derive the PPN of every row from its image path: the PPN is expected to be
# a complete path component ("/PPN.../"). Rows without such a component get a
# missing value.
ppn_in_path = rf"/({PPN_RE})/"
columns_df["PPN"] = columns_df["image_file"].str.extract(ppn_in_path, expand=False)


# Load the pickled MODS metadata table.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
mods_info_path = os.path.join(QURATOR_DATA, "digisam/mods_info/mods_info_df_all.2022-04-06.pkl")
mods_info = pd.read_pickle(mods_info_path)

# Make the record identifier available under the column name "PPN".
mods_info["PPN"] = mods_info["recordInfo_recordIdentifier"]


# Clean up mods_info a bit: of the columns named "name<N>_...", keep only
# those with N <= 1 and drop the rest.
columns_to_drop = [
    column
    for column in mods_info.columns
    if (matched := re.match(r"^name(\d+)_", column)) and int(matched.group(1)) > 1
]
mods_info = mods_info.drop(columns=columns_to_drop)


# Per-document aggregation: a document's column count is the most frequent
# column count among its pages; ties are resolved in favour of the largest.
def max_mode(x):
    """Return the largest of the most frequent values in the Series *x*."""
    most_frequent = x.mode()
    return most_frequent.max()
# Collapse the per-page rows to one column count per PPN using max_mode.
# NOTE(review): with pandas' default groupby settings, rows whose PPN is
# missing do not end up in any group -- confirm this is intended.
columns_by_ppn = columns_df.groupby(["PPN"])["columns"].agg(max_mode)


# The column DataFrame is huge, don't keep it after aggregating
del columns_df


# Combine the MODS metadata with the per-document column count. The left side
# is matched on its index, the right side on "PPN"; with the default inner
# join only documents present on both sides are kept.
documents = mods_info.merge(
    columns_by_ppn,
    left_index=True,
    right_on="PPN",
)
# XXX del mods_info


# Notebook cell output: display the merged table.
documents


# Keep only documents whose aggregated column count is exactly one and report
# how many rows survive the filter.
before = len(documents)
is_one_column = documents["columns"] == 1
documents = documents.loc[is_one_column].copy()
print(f"Selected {len(documents)} after keeping only one-column documents (before: {before})")

Selected 155208 after keeping only one-column documents (before: 169625)


def german(v):
    """Tell whether the language value *v* marks a document as German.

    Returns False for None. Otherwise returns True when "ger" or "de" is
    contained in *v* according to the `in` operator (membership for
    collections, substring match for strings).
    NOTE(review): values are presumably sets of language codes such as
    {"ger"} -- confirm against the mods_info data.
    """
    if v is None:
        return False
    return "ger" in v or "de" in v

# Flag every document using german() applied to its language terms.
documents["german"] = documents["language_languageTerm"].apply(german)

# Keep only the flagged documents and report how many rows survive.
before = len(documents)
is_german = documents["german"] == True  # explicit comparison yields a boolean mask
documents = documents.loc[is_german].copy()
print(f"Selected {len(documents)} after keeping only german documents (before: {before})")

Selected 70590 after keeping only german documents (before: 155208)


def druck(v):
    """Tell whether the classification value *v* contains "Historische Drucke".

    Returns False for None. Otherwise the result is that of the `in` operator
    (membership for collections, substring match for strings).
    NOTE(review): values are presumably sets of classification labels --
    confirm against the mods_info data.
    """
    if v is None:
        return False
    return "Historische Drucke" in v

# Flag every document using druck() applied to its ZVDD classification.
documents["druck"] = documents["classification-ZVDD"].apply(druck)

# Keep only the flagged documents and report how many rows survive.
before = len(documents)
is_druck = documents["druck"] == True  # explicit comparison yields a boolean mask
documents = documents.loc[is_druck].copy()
print(f"Selected {len(documents)} after keeping only documents with 'Historische Drucke' classification (before: {before})")

Selected 64259 after keeping only documents with 'Historische Drucke' classification (before: 70590)


# XXX Only here for debugging

import requests
from lxml import etree as ET

# XML namespace prefixes used in the find() calls below.
XMLNS = {
    'mets': 'http://www.loc.gov/METS/',
    'xlink': 'http://www.w3.org/1999/xlink',
    'oai': 'http://www.openarchives.org/OAI/2.0/',
}


def oai_mets(ppn, timeout=60):
    """Retrieve METS metadata for a given PPN.

    Sends an OAI-PMH ``GetRecord`` request (metadata prefix ``mets``) to the
    Staatsbibliothek endpoint and parses the response.

    Args:
        ppn: Identifier appended to the OAI identifier prefix.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The first ``mets:mets`` element of the response, or None when the
        response contains none -- in particular when the server answers with
        an OAI-PMH ``<error>`` element (for example ``idDoesNotExist``).

    The response, its raw content and the parsed tree are printed, as this
    function is meant for debugging.
    """

    API_URL = 'https://digital.staatsbibliothek-berlin.de/oai'
    IDENTIFIER_TEMPLATE = 'oai:digital.staatsbibliothek-berlin.de:%s'

    params = {
        'verb': 'GetRecord',
        'metadataPrefix': 'mets',
        'identifier': IDENTIFIER_TEMPLATE % ppn
    }

    r = requests.get(API_URL, params=params, timeout=timeout)
    print(r)
    print(r.content)
    tree = ET.fromstring(r.content)
    print(tree)

    # OAI-PMH reports failures inside the XML response, e.g.
    #   <error code="idDoesNotExist">oai:digital.staatsbibliothek-berlin.de:22069</error>
    # Report them explicitly instead of silently finding no METS element.
    error = tree.find('.//oai:error', XMLNS)
    if error is not None:
        print(f"OAI-PMH error {error.get('code')!r} for {ppn}: {error.text}")
        return None

    return tree.find('.//mets:mets', XMLNS)


# Look at a random sample of up to 25 of the selected documents. The sample
# size is clamped to the number of rows because DataFrame.sample raises a
# ValueError when asked for more rows than exist (without replacement).
sample = documents.sample(n=min(25, len(documents)))
for ppn in sample.index:
    # oai_show comes from the experiment's local `lib` package.
    oai_show(ppn, show_info=False)
    print(ppn)

PPN815812620

PPN790473674

PPN1025431596

PPN731602811

PPN644767820

PPN799253960

PPN735111685

PPN736747524

PPN767491319

PPN746232667

	accessCondition-use and reproduction	accessCondition-restriction on access	classification-ZVDD	genre-aad	identifier-purl	identifier-vd17	language_languageTerm	location_physicalLocation	location_shelfLocator	name0_displayForm	...	classification-ark	abstract	accessCondition-embargo enddate	identifier-KSTO	identifier-EC1418	identifier-ISSN	identifier-VD17	genre	PPN	columns
PPN
PPN668289767	Public Domain Mark 1.0	open access	{Historische Drucke, Theologie}	{Leichenpredigt}	http://resolver.staatsbibliothek-berlin.de/SBB...	1:025021F	{ger}		Ee 700-418	Fienius, Johann	...	None	None	None	None	None	None	None	None	PPN668289767	1
PPN1741614708	Public Domain Mark 1.0	None	{Musiknoten, Musikhandschriften}	None	http://resolver.staatsbibliothek-berlin.de/SBB...	None	None	Staatsbibliothek zu Berlin - Preußischer Kultu...	Am.B 191	None	...	None	None	None	None	None	None	None	None	PPN1741614708	1
PPN1688518711	CC BY-NC-SA 4.0 International	None	{Musik, Schott-Archiv, Nachlässe und Autographe}	None	http://resolver.staatsbibliothek-berlin.de/SBB...	None	None	Staatsbibliothek zu Berlin - Preußischer Kultu...	55 Nachl 100/B,28055	None	...	None	None	None	None	None	None	None	None	PPN1688518711	1
PPN1037645456	CC BY-NC-SA 4.0 International	None	{Musik, Schott-Archiv, Nachlässe und Autographe}	None	http://resolver.staatsbibliothek-berlin.de/SBB...	None	None		55 Nachl 100/B,9715	André, Johann Anton	...	None	None	None	None	None	None	None	None	PPN1037645456	1
PPN1784531499	Public Domain Mark 1.0	open access	{Musiknoten, Musikhandschriften}	None	https://resolver.staatsbibliothek-berlin.de/SB...	None	None	Staatsbibliothek zu Berlin - Preußischer Kultu...	Mus.ms.autogr. Reger, M. 34	None	...	None	None	None	None	None	None	None	None	PPN1784531499	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
PPN838097693	Public Domain Mark 1.0	open access	{Historische Drucke, Theologie, VD18 digital}	{Epikedeion}	http://resolver.staatsbibliothek-berlin.de/SBB...	None	{ger}		100 in: 4" Ee 710-5	C. F. R.	...	None	None	None	None	None	None	None	None	PPN838097693	1
PPN832954616	None	None	{Nachlässe und Autographe, Alexander von Humbo...	{Werke}	http://resolver.staatsbibliothek-berlin.de/SBB...	None	{de}		Nachl. Alexander von Humboldt, gr. Kasten 11, ...	Humboldt, Alexander von	...	None	None	None	None	None	None	None	None	PPN832954616	1
PPN749492228	Public Domain Mark 1.0	open access	{Historische Drucke, Krieg 1914-1918}	None	http://resolver.staatsbibliothek-berlin.de/SBB...	None	{ger}		4"Krieg 1914/28149-18.1918	European Commission	...	None	None	None	None	None	None	None	None	PPN749492228	2
PPN746970110	Public Domain Mark 1.0	open access	{Historische Drucke, Rechtswissenschaft}	None	http://resolver.staatsbibliothek-berlin.de/SBB...	None	{ger}		Oy 1004-25.1893	Deutsche Forschungsgemeinschaft	...	None	None	None	None	None	None	None	None	PPN746970110	1
PPN847185974	None	None	{Musik, Nachlässe und Autographe}	None	http://resolver.staatsbibliothek-berlin.de/SBB...	None	None	None	Mus.Nachl. F. Busoni B I, 909	Busoni, Ferruccio	...	None	None	None	None	None	None	None	None	PPN847185974	1

Preamble

Prepare data

Columns

mods_info

TODO

Select suitable documents

A sample