Skip to content

Commit

Permalink
Merge branch 'ReDoOCR' into 4.x-ocr
Browse files Browse the repository at this point in the history
  • Loading branch information
csidirop committed Nov 28, 2023
2 parents acf1f7e + 09ee1c8 commit cd389bf
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 40 deletions.
72 changes: 41 additions & 31 deletions Classes/Controller/PageViewController.php
Original file line number Diff line number Diff line change
Expand Up @@ -144,27 +144,36 @@ protected function getFulltext($page)
// Get fulltext link:
$fileGrpsFulltext = GeneralUtility::trimExplode(',', $this->extConf['fileGrpFulltext']);

while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) {
//check if and where fulltext is present:
if (!empty($this->document->getDoc()->physicalStructureInfo[$this->document->getDoc()->physicalStructure[$page]]['files'][$fileGrpFulltext])) { //fulltext is remote present
$fulltext['url'] = $this->document->getDoc()->getFileLocation($this->document->getDoc()->physicalStructureInfo[$this->document->getDoc()->physicalStructure[$page]]['files'][$fileGrpFulltext]);
if ($this->settings['useInternalProxy']) {
// Configure @action URL for form.
$uri = $this->uriBuilder->reset()
->setTargetPageUid($GLOBALS['TSFE']->id)
->setCreateAbsoluteUri(!empty($this->settings['forceAbsoluteUrl']) ? true : false)
->setArguments([
'eID' => 'tx_dlf_pageview_proxy',
'url' => $fulltext['url'],
'uHash' => GeneralUtility::hmac($fulltext['url'], 'PageViewProxy')
])
->build();

$fulltext['url'] = $uri;
if (PageViewController::getOCRengine(Doc::$extKey) == "originalremote") {
//check if remote fulltext exists:
while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) {
if (!empty($this->document->getDoc()->physicalStructureInfo[$this->document->getDoc()->physicalStructure[$page]]['files'][$fileGrpFulltext])) { //fulltext is remote present
$fulltext['url'] = $this->document->getDoc()->getFileLocation($this->document->getDoc()->physicalStructureInfo[$this->document->getDoc()->physicalStructure[$page]]['files'][$fileGrpFulltext]);
if ($this->settings['useInternalProxy']) {
// Configure @action URL for form.
$uri = $this->uriBuilder->reset()
->setTargetPageUid($GLOBALS['TSFE']->id)
->setCreateAbsoluteUri(!empty($this->settings['forceAbsoluteUrl']) ? true : false)
->setArguments([
'eID' => 'tx_dlf_pageview_proxy',
'url' => $fulltext['url'],
'uHash' => GeneralUtility::hmac($fulltext['url'], 'PageViewProxy')
])
->build();

$fulltext['url'] = $uri;
}
$fulltext['mimetype'] = $this->document->getDoc()->getFileMimeType($this->document->getDoc()->physicalStructureInfo[$this->document->getDoc()->physicalStructure[$page]]['files'][$fileGrpFulltext]);
setcookie('tx-dlf-ocr-remotepresent', "Y", ['SameSite' => 'lax']);
break;
} else { //no fulltext present
$this->logger->notice('No full-text file found for page "' . $page . '" in fileGrp "' . $fileGrpFulltext . '"');
setcookie('tx-dlf-ocr-remotepresent', "N", ['SameSite' => 'lax']);
}
$fulltext['mimetype'] = $this->document->getDoc()->getFileMimeType($this->document->getDoc()->physicalStructureInfo[$this->document->getDoc()->physicalStructure[$page]]['files'][$fileGrpFulltext]);
break;
} else if (FullTextGenerator::checkLocal(Doc::$extKey, $this->document, $page)) { //fulltext is locally present
}
} else {
//check if local fulltext exists:
if (FullTextGenerator::checkLocal(Doc::$extKey, $this->document, $page)) { //fulltext is locally present
//check server protocol (https://stackoverflow.com/a/14270161):
if ( isset($_SERVER['HTTPS']) && ($_SERVER['HTTPS'] == 'on' || $_SERVER['HTTPS'] == 1)
|| isset($_SERVER['HTTP_X_FORWARDED_PROTO']) && $_SERVER['HTTP_X_FORWARDED_PROTO'] == 'https') {
Expand All @@ -174,10 +183,9 @@ protected function getFulltext($page)
}
$fulltext['url'] = $protocol . $_SERVER['HTTP_HOST'] . "/" . FullTextGenerator::getPageLocalPath(Doc::$extKey, $this->document, $page);
$fulltext['mimetype'] = "text/xml";
} else { //no fulltext present
$this->logger->notice('No full-text file found for page "' . $page . '" in fileGrp "' . $fileGrpFulltext . '"');
}
}

if (empty($fulltext)) {
$this->logger->notice('No full-text file found for page "' . $page . '" in fileGrps "' . $this->extConf['fileGrpFulltext'] . '"');
}
Expand Down Expand Up @@ -390,14 +398,12 @@ protected function checkFulltextAvailability(int $page):array {
*/
public static function getOCRengine(string $extKey):string {
$conf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get($extKey);
$ocrEngine = '';

if(!is_null($_COOKIE['tx-dlf-ocrEngine']) && str_contains(self::$ocrEngines, $_COOKIE['tx-dlf-ocrEngine'])){
$ocrEngine = $_COOKIE['tx-dlf-ocrEngine'];
if(!is_null($_COOKIE['tx-dlf-ocrEngine']) && (str_contains(self::$ocrEngines, $ocrEngine=$_COOKIE['tx-dlf-ocrEngine']) || $ocrEngine == "originalremote")){
return $ocrEngine;
} else {
$ocrEngine = $conf['ocrEngine'] ; //get default value
return $conf['ocrEngine'] ; //get default value
}
return $ocrEngine;
}

/**
Expand All @@ -408,19 +414,23 @@ public static function getOCRengine(string $extKey):string {
* @return void
*/
protected function generateFullText():void {
//OCR all pages: type=book
if(($engine = PageViewController::getOCRengine(Doc::$extKey)) == "originalremote") {
return;
}

// OCR all pages: (type=book)
if($_POST["request"]["type"] == "book") {
//collect all image urls:
$images = array();
for ($i=1; $i <= $this->document->getDoc()->numPages; $i++) {
$images[$i] = $this->getImage($i)["url"];
}
FullTextGenerator::createBookFullText(Doc::$extKey, $this->document, $images, self::getOCRengine(Doc::$extKey));
FullTextGenerator::createBookFullText(Doc::$extKey, $this->document, $images, $engine);
return;
}

//OCR only this page
FullTextGenerator::createPageFullText(Doc::$extKey, $this->document, $this->getImage($this->requestData['page'])["url"], $this->requestData['page'], self::getOCRengine(Doc::$extKey));
// OCR only this page:
FullTextGenerator::createPageFullText(Doc::$extKey, $this->document, $this->getImage($this->requestData['page'])["url"], $this->requestData['page'], $engine);
}

/**
Expand Down
4 changes: 2 additions & 2 deletions Classes/Plugin/FullTextGenerator.php
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ protected static function getPageLocalId(Doc $doc, int $pageNum):string {
public static function getDocLocalPath(string $extKey, Document $document):string {
$conf = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance(ExtensionConfiguration::class)->get($extKey);

return $conf['fulltextFolder'] . "/" . self::generateUniqueDocLocalPath($document);
return $conf['fulltextFolder'] . self::generateUniqueDocLocalPath($document);
}

/**
Expand Down Expand Up @@ -212,7 +212,7 @@ protected static function generatePageOCR(string $extKey, array $conf, Document
$outputFolderPath = "$documentPath/$ocrEngine"; //Fulltextfolder (eg. fileadmin/fulltextFolder/URN/nbn/de/bsz/180/digosi/30/tesseract-basic/)
$origMetsPath = $documentPath."/".self::getDocLocalId($doc).".xml"; //Path to original METS (eg. fileadmin/fulltextFolder/URN/nbn/de/bsz/180/digosi/30/log59088.xml)
$newMetsPath = $outputFolderPath."/".self::getDocLocalId($doc).".xml"; //Path to updated METS
$outputPath = "$outputFolderPath/$pageId.xml"; //Fulltextfile path
$outputPath = "$outputFolderPath/$pageId.xml"; //Fulltextfile path (eg. fileadmin/fulltextFolder/URN/nbn/de/bsz/180/digosi/30/tesseract-basic/log59088_295.xml)
$tmpOutputFolderPath = $conf['fulltextTempFolder'] . self::generateUniqueDocLocalPath($document) . "/$ocrEngine"; //(eg. fileadmin/_temp_/ocrTempFolder/fulltext/URN/nbn/de/bsz/180/digosi/30/tesseract-basic)
$tmpImagePath = $conf['fulltextImagesFolder'] . self::generateUniqueDocLocalPath($document) . "/$pageId"; //Imagefile path (eg. fileadmin/_temp_/ocrTempFolder/images/URN/nbn/de/bsz/180/digosi/30/log59088_1)
$tmpOutputPath = $tmpOutputFolderPath . "/$pageId"; //Fulltextfile temporary path (eg. fileadmin/_temp_/ocrTempFolder/fulltext/URN/nbn/de/bsz/180/digosi/30/tesseract-basic/log59088_295)
Expand Down
4 changes: 2 additions & 2 deletions Classes/Plugin/Tools/FullTextGenerationScripts/OCRmain.sh
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ done
SECONDS=0 #measure time

# Run given OCR-Engine:
$ocrEngine --pageId $pageId --imagePath $imagePath --outputPath $tmpOutputPath --tmpImagePath $tmpImagePath
$ocrEngine --pageId $pageId --pageNum $pageNum --imagePath $imagePath --outputPath $tmpOutputPath --tmpImagePath $tmpImagePath

# Move temporary output file to final location, if it is not already there:
if [ "$outputPath" != "$tmpOutputPath" ]; then
Expand All @@ -83,7 +83,7 @@ fi

# Update METS file:
if [ "$ocrUpdateMets" == "1" ]; then
./typo3conf/ext/dlf/Classes/Plugin/Tools/FullTextGenerationScripts/UpdateMets.sh --pageId $pageId --outputPath $outputPath --url $url --ocrEngine $ocrEngine --ocrIndexMets $ocrIndexMets
./typo3conf/ext/dlf/Classes/Plugin/Tools/FullTextGenerationScripts/UpdateMets.sh --pageId $pageId --pageNum $pageNum --outputPath $outputPath --url $url --ocrEngine $ocrEngine --ocrIndexMets $ocrIndexMets
fi

echo -e "OCR completed in $SECONDS seconds"
Expand Down
31 changes: 26 additions & 5 deletions Classes/Plugin/Tools/FullTextGenerationScripts/UpdateMets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ set -euo pipefail # exit on: error, undefined variable, pipefail
while [ $# -gt 0 ] ; do
case $1 in
--pageId) pageId="$2" ;; # Page ID (eg. log59088_1)
--pageNum) pageNum="$2" ;; # Page number (eg. 1)
--url) url="$2" ;; # Alto URL (eg. http://localhost/fileadmin/fulltextFolder//URN/nbn/de/bsz/180/digosi/30/tesseract-basic/log59088_1.xml)
--outputPath) outputPath="$2" ;; # Fulltextfile path (eg. /var/www/typo3/public/fileadmin/fulltextfolder/URN/nbn/de/bsz/180/digosi/30/tesseract-basic/log59088_1.xml)
--ocrEngine) ocrEngine="$2" ;; # OCR-Engine (eg. /var/www/typo3/public/typo3conf/ext/dlf/Classes/Plugin/Tools/FullTextGenerationScripts/tesseract-basic.sh)
Expand All @@ -21,7 +22,7 @@ done

# Extract some values from parameters:
docLocalId=$(rev <<< "$pageId" | cut -d _ -f 2- | rev) # (eg. log_59088_1 -> log_59088)
pageNum=$(rev <<< "$pageId" | cut -d _ -f 1 | rev) # (eg. log_59088_1 -> 1)
#pageNum=$(rev <<< "$pageId" | cut -d _ -f 1 | rev) # (eg. log_59088_1 -> 1)
outputFolder=$(rev <<< "$outputPath" | cut -d / -f 2- | rev) # (eg. /var/www/typo3/public/fileadmin/fulltextfolder/URN/nbn/de/bsz/180/digosi/30/tesseract-basic)
ocrEngine=$(rev <<< "$ocrEngine" | cut -d '/' -f 1 | cut -d '.' -f 2- | rev) # (eg. tesseract-basic)
metsUrl=$(rev <<< "$url" | cut -d / -f 2- | rev)"/$docLocalId.xml"
Expand Down Expand Up @@ -49,10 +50,30 @@ if [ $oai ] ; then
mv mets_tmp.xml mets.xml
fi

# Update METS with given ALTO file:
ocrd --log-level INFO workspace add --force --file-grp FULLTEXT --file-id "fulltext-$pageId" --page-id="fulltext-$pageNum" --mimetype text/xml "$url"
xmlstarlet ed -L -a "//mets:file[@ID='fulltext-$pageId']" -t attr -n "CREATED" -v "$(date +%Y-%m-%dT%H:%M:%S%z)" mets.xml # Add Date attribute to file node
xmlstarlet ed -L -a "//mets:file[@ID='fulltext-$pageId']" -t attr -n "SOFTWARE" -v "DFG-Viewer-OCR-On-Demand-$ocrEngine" mets.xml # Add OCR-ENGINE attribute to file node
# Check if there is already a FULLTEXT section for the given pageId:
# 1. Get all FILEIDs from structMap for given page number:
physID=$(xmlstarlet sel -N mets="http://www.loc.gov/METS/" -t -v "//mets:div[@ORDER='$pageNum']/@ID" mets.xml)
fileIdList=($(xmlstarlet sel -N mets="http://www.loc.gov/METS/" -t -v '//mets:structMap[@TYPE="PHYSICAL"]/mets:div/mets:div[@ID="'$physID'"]/mets:fptr/@FILEID' -n mets.xml ));
# 2. Check if there is already a FULLTEXT section for the given fileId:
updated=0 # Flag to check if METS was updated
for fileId in "${fileIdList[@]}"; do
if [[ -n $(xmlstarlet sel -N mets="http://www.loc.gov/METS/" -t -v '//mets:fileSec/mets:fileGrp[@USE="FULLTEXT"]/mets:file[@ID="'$fileId'"]/mets:FLocat/@xlink:href' -n mets.xml) ]] ; then
updated=1 # Set flag to 1

# Update METS by updating existing elements with given ALTO file:
ocrd --log-level INFO workspace add --force --file-grp FULLTEXT --file-id "$fileId" --page-id="$physID" --mimetype text/xml "$url"
xmlstarlet ed -L -N mets="http://www.loc.gov/METS/" -u "//mets:file[@ID='$fileId']/@CREATED" -v "$(date +%Y-%m-%dT%H:%M:%S%z)" -i "//mets:file[@ID='$fileId'][not(@CREATED)]" -t attr -n "CREATED" -v "$(date +%Y-%m-%dT%H:%M:%S%z)" mets.xml # Add/Update date attribute to file node
xmlstarlet ed -L -N mets="http://www.loc.gov/METS/" -u "//mets:file[@ID='$fileId']/@SOFTWARE" -v "DFG-Viewer-OCR-On-Demand-$ocrEngine" -i "//mets:file[@ID='$fileId'][not(@SOFTWARE)]" -t attr -n "SOFTWARE" -v "DFG-Viewer-OCR-On-Demand-$ocrEngine" mets.xml # Add OCR-ENGINE attribute to file node
fi
done

if [[ $updated == 0 ]]; then # No FULLTEXT section for fileId
# Update METS by adding given ALTO file:
ocrd --log-level INFO workspace add --force --file-grp FULLTEXT --file-id "fulltext-$pageId" --page-id="$physID" --mimetype text/xml "$url"
xmlstarlet ed -L -N mets="http://www.loc.gov/METS/" -a "//mets:file[@ID='fulltext-$pageId']" -t attr -n "CREATED" -v "$(date +%Y-%m-%dT%H:%M:%S%z)" mets.xml # Add Date attribute to file node
xmlstarlet ed -L -N mets="http://www.loc.gov/METS/" -a "//mets:file[@ID='fulltext-$pageId']" -t attr -n "SOFTWARE" -v "DFG-Viewer-OCR-On-Demand-$ocrEngine" mets.xml # Add OCR-ENGINE attribute to file node
# ocrd workspace update-page --order "$pageNum" "$physID" # Update physical structMap if needed
fi

# Validate METS:
#apt -y install libxml2-utils
Expand Down

0 comments on commit cd389bf

Please sign in to comment.