import sqlite3
import pandas as pd
import os
import re
import sys
sys.path.append("/home/mike/devel/qurator-mono-repo/experiments/2022-02 digisam columns")
from lib import oai_show#, oai_mets
# Root of the local qurator-data checkout; every input below is read from it.
QURATOR_DATA = "/home/mike/devel/qurator-data"
if not os.path.exists(QURATOR_DATA):
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    raise FileNotFoundError(f"QURATOR_DATA does not exist: {QURATOR_DATA}")

# Matches a PPN identifier: "PPN" followed by one or more digits and/or "X".
PPN_RE = r"PPN[\dX]+"

# Load the complete "result" table from the digisam SQLite database.
digisam_db = os.path.join(QURATOR_DATA, "experiments/2022-02-digisam-columns/digisam.sqlite3")
if not os.path.isfile(digisam_db):
    # sqlite3.connect() would otherwise silently create an empty database
    # file and the query below would fail with a confusing "no such table".
    raise FileNotFoundError(f"SQLite database not found: {digisam_db}")
con = sqlite3.connect(digisam_db)
try:
    columns_df = pd.read_sql_query("SELECT * from result", con)
finally:
    # Release the connection even when the query raises.
    con.close()
# Derive each row's PPN from its image path: the PPN is the path component
# enclosed by slashes that matches PPN_RE.
ppn_in_path = rf"/({PPN_RE})/"
columns_df["PPN"] = columns_df["image_file"].str.extract(ppn_in_path)

# Load the pickled MODS metadata frame and expose its record identifier
# under the same "PPN" column name used for the column data.
mods_info_pickle = os.path.join(QURATOR_DATA, "digisam/mods_info/mods_info_df_all.2022-04-06.pkl")
mods_info = pd.read_pickle(mods_info_pickle)
mods_info["PPN"] = mods_info["recordInfo_recordIdentifier"]
# Clean up mods_info a bit: columns are named "name<N>_..." per name entry;
# keep only entries 0 and 1 and drop every column whose index N is > 1.
columns_to_drop = [
    c
    for c in mods_info.columns
    if (m := re.match(r"^name(\d+)_", c)) and int(m.group(1)) > 1
]
mods_info = mods_info.drop(columns=columns_to_drop)
# Notes on lib.py:
# - move to central place
# - oai_show(ppn, show_info=True) only works when mods_info_df exists

# Aggregate document by the most frequent column count of the individual pages.
# If there are multiple most frequent column counts, take the largest.
def max_mode(x):
    """Return the most frequent value of *x*; on ties, the largest of them.

    Args:
        x: A pandas Series (used below as a groupby aggregation function, so
            it receives the "columns" values of one PPN group).

    Returns:
        The maximum over all modes of *x*.
    """
    # Series.mode() yields every value tied for the highest frequency.
    return x.mode().max()
# Collapse the per-row results to one value per document: group rows by PPN
# and reduce each group's "columns" values with max_mode.
columns_by_ppn = columns_df.groupby(["PPN"])["columns"].agg(max_mode)
# The column DataFrame is huge, don't keep it after aggregating
del columns_df
# Merge mods_info and column info
# NOTE(review): left_index=True joins on mods_info's index, so this presumably
# relies on mods_info being indexed by PPN — confirm.
documents = mods_info.merge(columns_by_ppn, left_index=True, right_on="PPN")
# XXX del mods_info
# Bare expression; presumably left here so a notebook cell displays the frame.
documents
accessCondition-use and reproduction | accessCondition-restriction on access | classification-ZVDD | genre-aad | identifier-purl | identifier-vd17 | language_languageTerm | location_physicalLocation | location_shelfLocator | name0_displayForm | ... | classification-ark | abstract | accessCondition-embargo enddate | identifier-KSTO | identifier-EC1418 | identifier-ISSN | identifier-VD17 | genre | PPN | columns | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PPN | |||||||||||||||||||||
PPN668289767 | Public Domain Mark 1.0 | open access | {Historische Drucke, Theologie} | {Leichenpredigt} | http://resolver.staatsbibliothek-berlin.de/SBB... | 1:025021F | {ger} | Ee 700-418 | Fienius, Johann | ... | None | None | None | None | None | None | None | None | PPN668289767 | 1 | |
PPN1741614708 | Public Domain Mark 1.0 | None | {Musiknoten, Musikhandschriften} | None | http://resolver.staatsbibliothek-berlin.de/SBB... | None | None | Staatsbibliothek zu Berlin - Preußischer Kultu... | Am.B 191 | None | ... | None | None | None | None | None | None | None | None | PPN1741614708 | 1 |
PPN1688518711 | CC BY-NC-SA 4.0 International | None | {Musik, Schott-Archiv, Nachlässe und Autographe} | None | http://resolver.staatsbibliothek-berlin.de/SBB... | None | None | Staatsbibliothek zu Berlin - Preußischer Kultu... | 55 Nachl 100/B,28055 | None | ... | None | None | None | None | None | None | None | None | PPN1688518711 | 1 |
PPN1037645456 | CC BY-NC-SA 4.0 International | None | {Musik, Schott-Archiv, Nachlässe und Autographe} | None | http://resolver.staatsbibliothek-berlin.de/SBB... | None | None | 55 Nachl 100/B,9715 | André, Johann Anton | ... | None | None | None | None | None | None | None | None | PPN1037645456 | 1 | |
PPN1784531499 | Public Domain Mark 1.0 | open access | {Musiknoten, Musikhandschriften} | None | https://resolver.staatsbibliothek-berlin.de/SB... | None | None | Staatsbibliothek zu Berlin - Preußischer Kultu... | Mus.ms.autogr. Reger, M. 34 | None | ... | None | None | None | None | None | None | None | None | PPN1784531499 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
PPN838097693 | Public Domain Mark 1.0 | open access | {Historische Drucke, Theologie, VD18 digital} | {Epikedeion} | http://resolver.staatsbibliothek-berlin.de/SBB... | None | {ger} | 100 in: 4" Ee 710-5 | C. F. R. | ... | None | None | None | None | None | None | None | None | PPN838097693 | 1 | |
PPN832954616 | None | None | {Nachlässe und Autographe, Alexander von Humbo... | {Werke} | http://resolver.staatsbibliothek-berlin.de/SBB... | None | {de} | Nachl. Alexander von Humboldt, gr. Kasten 11, ... | Humboldt, Alexander von | ... | None | None | None | None | None | None | None | None | PPN832954616 | 1 | |
PPN749492228 | Public Domain Mark 1.0 | open access | {Historische Drucke, Krieg 1914-1918} | None | http://resolver.staatsbibliothek-berlin.de/SBB... | None | {ger} | 4"Krieg 1914/28149-18.1918 | European Commission | ... | None | None | None | None | None | None | None | None | PPN749492228 | 2 | |
PPN746970110 | Public Domain Mark 1.0 | open access | {Historische Drucke, Rechtswissenschaft} | None | http://resolver.staatsbibliothek-berlin.de/SBB... | None | {ger} | Oy 1004-25.1893 | Deutsche Forschungsgemeinschaft | ... | None | None | None | None | None | None | None | None | PPN746970110 | 1 | |
PPN847185974 | None | None | {Musik, Nachlässe und Autographe} | None | http://resolver.staatsbibliothek-berlin.de/SBB... | None | None | None | Mus.Nachl. F. Busoni B I, 909 | Busoni, Ferruccio | ... | None | None | None | None | None | None | None | None | PPN847185974 | 1 |
169625 rows × 73 columns
# We keep only one-column, German documents classified as "Historische Drucke".
# Restrict to documents whose aggregated column count is exactly 1 and report
# how many rows survive the filter.
before = len(documents)
is_single_column = documents["columns"] == 1
documents = documents[is_single_column].copy()
print(f"Selected {len(documents)} after keeping only one-column documents (before: {before})")
Selected 155208 after keeping only one-column documents (before: 169625)
def german(v):
    """Return True if *v* contains a German language code ("ger" or "de").

    Args:
        v: A container of language codes (e.g. the set ``{"ger"}``), or None
            when no language is recorded. Membership is tested with ``in``,
            so for a plain string this is a substring test.

    Returns:
        bool: False for None, otherwise whether "ger" or "de" is in *v*.
    """
    if v is None:
        return False
    return "ger" in v or "de" in v
# Flag German-language documents, then keep only the flagged rows and report
# the row counts before and after.
documents["german"] = documents["language_languageTerm"].apply(german)
before = len(documents)
is_german = documents["german"] == True
documents = documents[is_german].copy()
print(f"Selected {len(documents)} after keeping only german documents (before: {before})")
Selected 70590 after keeping only german documents (before: 155208)
def druck(v):
    """Return True if *v* contains the classification "Historische Drucke".

    Args:
        v: A container of classification labels (e.g. the set
            ``{"Historische Drucke", "Theologie"}``), or None when no
            classification is recorded. Membership is tested with ``in``,
            so for a plain string this is a substring test.

    Returns:
        bool: False for None, otherwise whether the label is in *v*.
    """
    if v is None:
        return False
    return "Historische Drucke" in v
# Flag documents classified as "Historische Drucke", then keep only the
# flagged rows and report the row counts before and after.
documents["druck"] = documents["classification-ZVDD"].apply(druck)
before = len(documents)
is_druck = documents["druck"] == True
documents = documents[is_druck].copy()
print(f"Selected {len(documents)} after keeping only documents with 'Historische Drucke' classification (before: {before})")
Selected 64259 after keeping only documents with 'Historische Drucke' classification (before: 70590)
# XXX Only here for debugging
import requests
from lxml import etree as ET
# Prefix -> namespace URI map passed to ElementTree-style find() calls.
XMLNS = dict(
    mets='http://www.loc.gov/METS/',
    xlink='http://www.w3.org/1999/xlink',
)
def oai_mets(ppn, *, verbose=True, timeout=60):
    """Retrieve METS metadata for a given PPN.

    Sends an OAI-PMH ``GetRecord`` request (metadata prefix ``mets``) to the
    OAI endpoint and parses the XML response.

    Args:
        ppn: Identifier inserted into the OAI identifier template.
        verbose: If True (default, matching the previous debugging
            behaviour), print the response object, its raw content and the
            parsed root element.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The first ``mets:mets`` element found in the response, or None if the
        response contains no such element.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        ValueError: If the response carries an OAI-PMH ``<error>`` element,
            e.g. ``<error code="idDoesNotExist">...</error>`` for an unknown
            identifier.
    """
    API_URL = 'https://digital.staatsbibliothek-berlin.de/oai'
    IDENTIFIER_TEMPLATE = 'oai:digital.staatsbibliothek-berlin.de:%s'
    # Namespace of the OAI-PMH envelope (distinct from the METS payload).
    OAI_NS = {'oai': 'http://www.openarchives.org/OAI/2.0/'}

    params = {
        'verb': 'GetRecord',
        'metadataPrefix': 'mets',
        'identifier': IDENTIFIER_TEMPLATE % ppn,
    }
    # Without a timeout a stalled server would block this call forever.
    r = requests.get(API_URL, params=params, timeout=timeout)
    if verbose:
        print(r)
        print(r.content)
    r.raise_for_status()

    tree = ET.fromstring(r.content)
    if verbose:
        print(tree)

    # OAI-PMH reports protocol-level failures inside the XML body, so they
    # have to be detected here rather than via the HTTP status.
    error = tree.find('.//oai:error', OAI_NS)
    if error is not None:
        raise ValueError(
            f"OAI-PMH error {error.get('code')!r} for {ppn}: {error.text}"
        )

    return tree.find('.//mets:mets', XMLNS)
# Spot-check: show a random sample of up to 25 of the remaining documents.
# min() keeps sample() from raising when fewer than 25 documents are left.
sample = documents.sample(n=min(25, len(documents)))
for ppn in sample.index:
    oai_show(ppn, show_info=False)
    print(ppn)
PPN815812620
PPN790473674
PPN1025431596
PPN731602811
PPN644767820
PPN799253960
PPN735111685
PPN736747524
PPN767491319
PPN746232667
PPN721239994
PPN81771037X
PPN866203958
PPN1031100512
PPN1675671915
PPN74635200X
PPN64162249X
PPN1686726694
PPN738083690
PPN634934023
PPN715006037
PPN718540603
PPN717287912
PPN801012791
PPN1019851848