<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//TaxonX//DTD Taxonomic Treatment Publishing DTD v0 20100105//EN" "../../nlm/tax-treatment-NS0.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:tp="http://www.plazi.org/taxpub" article-type="research-article">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">17</journal-id>
      <journal-id journal-id-type="index">urn:lsid:arphahub.com:pub:8E638694-B4E0-570A-856A-746FF325BF6B</journal-id>
      <journal-title-group>
        <journal-title xml:lang="en">Research Ideas and Outcomes</journal-title>
        <abbrev-journal-title xml:lang="en">RIO</abbrev-journal-title>
      </journal-title-group>
      <issn pub-type="epub">2367-7163</issn>
      <publisher>
        <publisher-name>Pensoft Publishers</publisher-name>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="doi">10.3897/rio.6.e58030</article-id>
      <article-id pub-id-type="publisher-id">58030</article-id>
      <article-id pub-id-type="manuscript">14338</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Project Report</subject>
        </subj-group>
        <subj-group subj-group-type="conference-part">
          <subject>ICEDIG Project Outcomes</subject>
        </subj-group>
        <subj-group subj-group-type="sdg">
          <subject>Life on land</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Towards a scientific workflow featuring Natural Language Processing for the digitisation of natural history collections</article-title>
      </title-group>
      <contrib-group content-type="authors">
        <contrib contrib-type="author" corresp="no">
          <name name-style="western">
            <surname>Owen</surname>
            <given-names>David</given-names>
          </name>
          <uri content-type="orcid">https://orcid.org/0000-0002-4028-0591</uri>
          <xref ref-type="aff" rid="A1">1</xref>
        </contrib>
        <contrib contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Groom</surname>
            <given-names>Quentin</given-names>
          </name>
          <email xlink:type="simple">quentin.groom@plantentuinmeise.be</email>
          <uri content-type="orcid">https://orcid.org/0000-0002-0596-5376</uri>
          <xref ref-type="aff" rid="A2">2</xref>
        </contrib>
        <contrib contrib-type="author" corresp="no">
          <name name-style="western">
            <surname>Hardisty</surname>
            <given-names>Alex</given-names>
          </name>
          <uri content-type="orcid">https://orcid.org/0000-0002-0767-4310</uri>
          <xref ref-type="aff" rid="A1">1</xref>
        </contrib>
        <contrib contrib-type="author" corresp="no">
          <name name-style="western">
            <surname>Leegwater</surname>
            <given-names>Thijs</given-names>
          </name>
          <xref ref-type="aff" rid="A3">3</xref>
        </contrib>
        <contrib contrib-type="author" corresp="no">
          <name name-style="western">
            <surname>Livermore</surname>
            <given-names>Laurence</given-names>
          </name>
          <uri content-type="orcid">https://orcid.org/0000-0002-7341-1842</uri>
          <xref ref-type="aff" rid="A4">4</xref>
        </contrib>
        <contrib contrib-type="author" corresp="no">
          <name name-style="western">
            <surname>van Walsum</surname>
            <given-names>Myriam</given-names>
          </name>
          <xref ref-type="aff" rid="A5">5</xref>
        </contrib>
        <contrib contrib-type="author" corresp="no">
          <name name-style="western">
            <surname>Wijkamp</surname>
            <given-names>Noortje</given-names>
          </name>
          <xref ref-type="aff" rid="A3">3</xref>
        </contrib>
        <contrib contrib-type="author" corresp="no">
          <name name-style="western">
            <surname>Spasić</surname>
            <given-names>Irena</given-names>
          </name>
          <xref ref-type="aff" rid="A1">1</xref>
        </contrib>
      </contrib-group>
      <aff id="A1">
        <label>1</label>
        <addr-line content-type="verbatim">Cardiff University, Cardiff, United Kingdom</addr-line>
        <institution>Cardiff University</institution>
        <addr-line content-type="city">Cardiff</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="A2">
        <label>2</label>
        <addr-line content-type="verbatim">Meise Botanic Garden, Meise, Belgium</addr-line>
        <institution>Meise Botanic Garden</institution>
        <addr-line content-type="city">Meise</addr-line>
        <country>Belgium</country>
      </aff>
      <aff id="A3">
        <label>3</label>
        <addr-line content-type="verbatim">Picturae, Heerhugowaard, Netherlands</addr-line>
        <institution>Picturae</institution>
        <addr-line content-type="city">Heerhugowaard</addr-line>
        <country>Netherlands</country>
      </aff>
      <aff id="A4">
        <label>4</label>
        <addr-line content-type="verbatim">The Natural History Museum, London, United Kingdom</addr-line>
        <institution>The Natural History Museum</institution>
        <addr-line content-type="city">London</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="A5">
        <label>5</label>
        <addr-line content-type="verbatim">Naturalis Biodiversity Centre, Leiden, Netherlands</addr-line>
        <institution>Naturalis Biodiversity Centre</institution>
        <addr-line content-type="city">Leiden</addr-line>
        <country>Netherlands</country>
      </aff>
      <author-notes>
        <fn fn-type="corresp">
          <p>Corresponding author: Quentin Groom (<email xlink:type="simple">quentin.groom@plantentuinmeise.be</email>).</p>
        </fn>
        <fn fn-type="edited-by">
          <p>Academic editor: </p>
        </fn>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>28</day>
        <month>08</month>
        <year>2020</year>
      </pub-date>
      <volume>6</volume>
      <elocation-id>e58030</elocation-id>
      <uri content-type="arpha" xlink:href="http://openbiodiv.net/4FE5F7ED-3631-5C2B-84B1-903AE3C2FCA2">4FE5F7ED-3631-5C2B-84B1-903AE3C2FCA2</uri>
      <history>
        <date date-type="received">
          <day>26</day>
          <month>08</month>
          <year>2020</year>
        </date>
      </history>
      <permissions>
        <copyright-statement>David Owen, Quentin Groom, Alex Hardisty, Thijs Leegwater, Laurence Livermore, Myriam van Walsum, Noortje Wijkamp, Irena Spasić</copyright-statement>
        <license license-type="creative-commons-attribution" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">
          <license-p>This is an open access article distributed under the terms of the Creative Commons Attribution License (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
        </license>
      </permissions>
      <abstract>
        <label>Abstract</label>
        <p>We describe an effective approach to automated text digitisation with respect to natural history specimen labels. These labels contain much useful data about the specimen including its collector, country of origin, and collection date. Our approach to automatically extracting these data takes the form of a pipeline. Recommendations are made for the pipeline's component parts based on state-of-the-art technologies.</p>
        <p>Optical Character Recognition (OCR) can be used to digitise text on images of specimens. However, recognising text quickly and accurately from these images can be a challenge for OCR. We show that OCR performance can be improved by prior segmentation of specimen images into their component parts. This ensures that only text-bearing labels are submitted for OCR processing as opposed to whole specimen images, which inevitably contain non-textual information that may lead to false positive readings. In our testing Tesseract OCR version 4.0.0 offers promising text recognition accuracy with segmented images.</p>
        <p>Not all the text on specimen labels is printed. Handwritten text varies much more and does not conform to standard shapes and sizes of individual characters, which poses an additional challenge for OCR. Recently, deep learning has allowed for significant advances in this area. Google's Cloud Vision, which is based on deep learning, is trained on large-scale datasets, and is shown to be quite adept at this task. This may take us some way towards negating the need for humans to routinely transcribe handwritten text.</p>
        <p>Determining the countries and collectors of specimens has been the goal of previous automated text digitisation research activities. Our approach also focuses on these two pieces of information. An area of Natural Language Processing (NLP) known as Named Entity Recognition (NER) has matured enough to semi-automate this task. Our experiments demonstrated that existing approaches can accurately recognise location and person names within the text extracted from segmented images via Tesseract version 4.0.0.</p>
        <p>We have highlighted the main recommendations for potential pipeline components. The paper also provides guidance on selecting appropriate software solutions. These include automatic language identification, terminology extraction, and integrating all pipeline components into a scientific workflow to automate the overall digitisation process.</p>
      </abstract>
      <kwd-group>
        <label>Keywords</label>
        <kwd>automated text digitisation</kwd>
        <kwd>natural language processing</kwd>
        <kwd>named entity recognition</kwd>
        <kwd>optical character recognition</kwd>
        <kwd>handwritten text recognition</kwd>
        <kwd>language identification</kwd>
        <kwd>terminology extraction</kwd>
        <kwd>scientific workflows</kwd>
        <kwd>natural history specimens</kwd>
        <kwd>label data</kwd>
      </kwd-group>
      <funding-group>
        <award-group>
          <funding-source>
            <named-content content-type="funder_name">Horizon 2020</named-content>
            <named-content content-type="funder_identifier">501100007601</named-content>
            <named-content content-type="funder_doi">http://doi.org/10.13039/501100007601</named-content>
          </funding-source>
        </award-group>
      </funding-group>
      <counts>
        <fig-count count="13"/>
        <table-count count="11"/>
        <ref-count count="50"/>
      </counts>
    </article-meta>
    <notes>
      <sec sec-type="Funding program">
        <title>Funding program</title>
        <p>
          <ext-link ext-link-type="uri" xlink:href="https://cordis.europa.eu/programme/id/H2020-EU.1.4.1.1.">H2020-EU.1.4.1.1. - Developing new world-class research infrastructures</ext-link>
        </p>
      </sec>
    </notes>
  </front>
  <body>
    <sec sec-type="1. Introduction">
      <title>1. Introduction</title>
      <sec sec-type="1.1 Background">
        <title>1.1 Background</title>
        <p>We do not know how many specimens are held in the world's museums and herbaria. However, estimates of three billion seem reasonable (<xref ref-type="bibr" rid="B5930335">Wheeler et al. 2012</xref>). These specimens are irreplaceable and contribute to a diverse range of scientific fields (<xref ref-type="bibr" rid="B5930292">Suarez and Tsutsui 2004</xref>; <xref ref-type="bibr" rid="B5930240">Pyke and Ehrlich 2010</xref>). Their labels hold data on species distributions, scientific names, traits, people and habitats. Among those specimens are nomenclatural types that underpin the whole of formal taxonomy and define the species concept. These specimens span more than 200 years of biodiversity research and are an important source of data on species populations and environmental change. This enormous scientific legacy is largely locked into the typed or handwritten labels mounted with the specimen or in associated ledgers and field notebooks. It is a significant challenge to extract these data digitally, particularly without introducing errors. Furthermore, the provenance of these data must be maintained so that they can be verified against the original specimen.</p>
        <p>Perhaps the method most widely used today to extract these data from labels is for expert technicians to type the specimen details into a dedicated collection management system. They might, at the same time, georeference specimens where coordinates are not already provided on the specimen label. Volunteers have often been recruited to help with this process and, in some cases transcription has been outsourced to companies specialising in document transcription (<xref ref-type="bibr" rid="B5929923">Engledow et al. 2018</xref>; <xref ref-type="bibr" rid="B5929881">Ellwood et al. 2018</xref>).</p>
        <p>Nevertheless, human transcription of labels is slow and requires both skill to read the handwritten labels and knowledge of the names of places, people, and organisms. These labels are written in many languages often in the same collection and sometimes on the same label. Furthermore, abbreviations are frequently used and there is little standardisation on where each datum can be found on the label.</p>
        <p>Full or partial automation of this process is desirable to improve the speed and accuracy of data extraction and to reduce the associated costs. Automating even the simplest tasks such as triaging the labels by language or writing method (typed versus handwritten) stands to improve the overall efficiency of the human-in-the-loop approach. Optical Character Recognition (OCR) and Natural Language Processing (NLP) are two technologies that may support automation. OCR aims to convert images of text into a machine-readable format (<xref ref-type="bibr" rid="B5930139">Mori et al. 1999</xref>). NLP provides a range of methods for the interpretation of text by machine (<xref ref-type="bibr" rid="B5930066">Indurkhya and Damerau 2010</xref>).</p>
        <p>OCR and NLP proved effective for extracting data from biodiversity literature (<xref ref-type="bibr" rid="B5930309">Thessen et al. 2012</xref>; <xref ref-type="bibr" rid="B5930048">Hoehndorf et al. 2016</xref>). However, specimen labels pose additional problems compared to formally structured text such as that found in literature. The context of individual words is often difficult to determine. Specimens that overlap with the label may obscure some words. The orientation of labels typically varies. Typed and handwritten text may coexist within the same label and the handwriting on the same specimen may come from different people (Fig. <xref ref-type="fig" rid="F5930393">1</xref>). Therefore, the task of digitising the text found in specimen labels is far from simple and requires different approaches from standard text recognition.</p>
        <p>This paper examines the state of the art in automated text digitisation with respect to specimen images. The recommendations within are designed to enhance the digitisation and transcription pipelines that exist at partner institutions. They are also intended to provide guidance towards a proposed centralised specimen enrichment pipeline that could be created under a pan-European Research Infrastructure for biodiversity collections (<xref ref-type="bibr" rid="B5929855">DiSSCo 2020</xref>). This pipeline would provide state-of-the-art label digitisation services to institutions that need them.</p>
        <p>In this paper, we focus mainly on herbarium specimens, even though similar data extraction problems exist for pinned insects, liquid collections, and animal skins. Herbarium specimens are among the most difficult targets and we know from recent successful pilot studies for large-scale digitisation such as Herbadrop (<xref ref-type="bibr" rid="B5929932">EUDAT 2017</xref>) that they provide a good test of the technology. Furthermore, herbaria have been among the first to mass image their collections, so there is a vast number of specimen images available for testing.</p>
      </sec>
      <sec sec-type="1.2 Digitisation Workflow">
        <title>1.2 Digitisation Workflow</title>
        <p>We now outline a potential digitisation workflow, which is designed to process specimens and extract targeted data from them (Fig. <xref ref-type="fig" rid="F5930404">2</xref>). Starting with the original specimen, it is initially converted to a digital image. Though a digital object itself, the image does not immediately contain digitised text. In other words, though readable by humans, the image of the text is not yet searchable, i.e., encoded as a string of characters that can be processed by machine. The role of OCR is to convert text images into searchable text documents.</p>
        <p>To make these text documents searchable by the type of information that they contain, another layer of information (metadata) is required on top of the original text. This step requires deeper analysis of the textual content, which is performed using NLP including language identification, Named Entity Recognition (NER), and terminology extraction. The role of language identification here is twofold. If the labels are to be transcribed manually, then language identification can help us direct transcription tasks to the transcribers with suitable language skills. Similarly, if the labels were to be processed automatically, then the choice of tools will also depend on the given language.</p>
        <p>NER will support further structuring of the text by interpreting relevant portions of the text, such as those referring to people and locations. In addition to the extracted data and the associated metadata, the digitised collection should also incorporate a terminology that facilitates the interpretation of the scientific content described in the specimens. Many specimen labels contain either obscure or outdated terminology. Therefore, standard terminologies need to be supplemented by terminology extracted from the specimens.</p>
        <p>Finally, the performance of both OCR and NLP can be improved by restricting their view to only the labels on the specimen. This can be achieved by segmenting images prior to processing by identifying the areas of the image that relate to individual labels. However, there are trade-offs between the time it takes to segment images compared to the improved performance of OCR and NLP. In a production environment processing time is limited because of the need to ingest images into storage from a production line through a pipeline that includes quality control, the creation of image derivatives, and image processing.</p>
        <p>To help determine the subsequent steps in the pipeline it may be necessary to establish the language of the text recognised in the OCR step. This next step may be the deployment of language-specific NLP tools to identify useful information in the target specimen. Or it may be the channelling of the text for manual transcription. A number of software solutions exist for performing language identification and are explored in <ext-link ext-link-type="uri" xlink:href="#33_language_identification">section 3.3</ext-link>.</p>
        <p>An approach to automatic identification of data from OCR recognised text might include NER. This is an NLP task that identifies categories of information such as people and places. This approach may be suitable for finding a specimen's collector and collection country from text. <ext-link ext-link-type="uri" xlink:href="#34_named_entity_recognition">Section 3.4</ext-link> investigates this possibility using an NER tool.</p>
      </sec>
      <sec sec-type="1.3 Project Context">
        <title>1.3 Project Context</title>
        <p>This project report was written as a formal Deliverable (D4.1) of the <ext-link ext-link-type="uri" xlink:href="https://icedig.eu/">ICEDIG Project</ext-link> and was previously made available on Zenodo (<xref ref-type="bibr" rid="B5930228">Owen et al. 2019</xref>) and submitted to the European Commission as a report. While the differences between these versions are minor the authors consider this the definitive version of the report.</p>
      </sec>
    </sec>
    <sec sec-type="2. Data">
      <title>2. Data</title>
      <sec sec-type="2.1 Data Collection">
        <title>2.1 Data Collection</title>
        <p>As noted above there is a large body of digitised herbarium specimens available for experimentation. A herbarium is a collection of pressed plant specimens and associated data (Fig. <xref ref-type="fig" rid="F5930393">1</xref><xref ref-type="fig" rid="F5930398">a</xref>). As indicated in Fig. <xref ref-type="fig" rid="F5930404">2</xref>, the first step in digitisation of these specimens is to produce a digital image. This requires physical manipulation of specimens, which is beyond the scope of the present task. Instead of gaining access to the original specimens, we collected their images in JPEG format from the partner institutions (<xref ref-type="bibr" rid="B5929838">Dillen et al. 2019</xref>). The choice of images sampled from these collections was based on the requirement to test OCR on a representative sample of the specimens in terms of their temporal and spatial coverage. This is because the age and origin of specimens may present different OCR challenges. For example, specimens can include printed, typed, or handwritten labels, which may be partially obscured or have different orientations.</p>
        <p>Each partner herbarium contributed 200 images containing a geographical and temporal cross-section of nomenclatural type and non-type herbarium specimens (Fig. <xref ref-type="fig" rid="F5930406">3</xref>). A type specimen is used to name a newly identified taxon.</p>
        <p>A total of nine herbaria, described in Table <xref ref-type="table" rid="T5930428">1</xref>, each contributed 200 specimen images giving a total of 1800 images, which formed a dataset for use in this study.</p>
      </sec>
      <sec sec-type="2.2 Data Properties">
        <title>2.2 Data Properties</title>
        <p>To illustrate the textual content of these images and to better understand the challenges posed to the OCR, Fig. <xref ref-type="fig" rid="F5930408">4</xref> provides an example of labels attached to a specimen shown in Fig. <xref ref-type="fig" rid="F5930393">1</xref><xref ref-type="fig" rid="F5930398">a</xref>. In general, the labels can contain the following information:</p>
        <p><list list-type="order">
          <list-item>
            <p><bold>Title</bold>: Organisation that owns the specimen.</p>
          </list-item>
          <list-item>
            <p><bold>Barcode</bold>: The specimen's machine readable identifier.</p>
          </list-item>
          <list-item>
            <p><bold>Species name</bold>: Scientific or common name of the species.</p>
          </list-item>
          <list-item>
            <p><bold>Determined by and date</bold>: The person who identified the specimen and the date of identification.</p>
          </list-item>
          <list-item>
            <p><bold>Locality</bold>: The geographical location where the specimen was collected.</p>
          </list-item>
          <list-item>
            <p><bold>Habitat and altitude</bold>: The habitat in which the specimen was collected and its altitude.</p>
          </list-item>
          <list-item>
            <p><bold>Notes</bold>: Additional notes written by the collector, often related to the characters of the species.</p>
          </list-item>
          <list-item>
            <p><bold>Collector name, specimen number, and collection date</bold>: The name of the person(s) who collected the specimen, the identifier that they used to record and manage specimens, and the date that the specimen was collected.</p>
          </list-item>
        </list></p>
        <p>The above list is non-exhaustive and more or less information may be recorded by the collector or determiner.</p>
        <p>The properties of textual content of the given herbarium have been extrapolated from a random sample of 10 specimens per institution (Table <xref ref-type="table" rid="T5930429">2</xref>).</p>
        <p>A subset of 250 images with labels written in English has been selected to test the performance of image segmentation and its effects on OCR and NER. For the purposes of these tests these images were manually divided into a total of 1,837 label segments, which were then processed separately. <xref ref-type="bibr" rid="B5998994">Nieva de la Hidalga et al. 2019</xref> discuss segmentation methods and results from the ICEDIG project.</p>
        <p>The segments effectively separate labels, barcodes, and colour charts. Examples can be seen in Fig. <xref ref-type="fig" rid="F5930410">5</xref>. Item 1 is a label containing the species name, the collection location, and the collector's name. Some of the information is printed while some of it is handwritten. In contrast, the label shown as Item 2 contains printed text only. However, its vertical orientation may cause additional difficulties. The label seen in Item 3 contains printed text that states the organisation that owns the specimen together with a barcode that identifies the specimen locally. However, the barcode stripes can sometimes be misinterpreted as text by overzealous OCR software. A colour chart, such as the one shown in Item 4, contains no text, so it does not need to be processed further. Finally, Item 5 presents a ruler, which is accompanied by text that is not specific to the specimen and therefore does not need to be considered. A machine learning classifier can be trained on segmented images to differentiate between different classes of labels in order to triage them ahead of the subsequent steps in the digitisation workflow.</p>
      </sec>
      <sec sec-type="2.3 Metadata">
        <title>2.3 Metadata</title>
        <p>The role of OCR is to convert image text into searchable text. To make this text searchable by the type of information that they contain, another layer of information (metadata) is required on top of the original text. We can differentiate between three different types of metadata (<xref ref-type="bibr" rid="B5930249">Riley 2017</xref>):</p>
        <p><list list-type="order">
          <list-item>
            <p><italic>Descriptive</italic> metadata facilitate searching using descriptors that qualify their content. For example, digitised specimens can be accessed by a species name, its collection location, or its collector.</p>
          </list-item>
          <list-item>
            <p><italic>Structural</italic> metadata describe how the components of the data object are organised thereby facilitating navigation through its content. For example, labelling each segment of a digitised specimen by its type can facilitate their management. As shown in Fig. <xref ref-type="fig" rid="F5930410">5</xref>, segment types include colour chart, ruler, barcode, collector's label, and determination.</p>
          </list-item>
          <list-item>
            <p><italic>Administrative</italic> metadata convey technical information that can be used to manage data objects. Examples include time of creation, digital format, and software used.</p>
          </list-item>
        </list></p>
        <p>While metadata can take many forms, it is important to comply with a common standard to improve accessibility to the data. Darwin Core (<xref ref-type="bibr" rid="B5930379">Wieczorek et al. 2012</xref>) is one such standard maintained by the Darwin Core Maintenance Group of the Biodiversity Information Standards organisation (TDWG). It includes a glossary of terms intended to facilitate the sharing of information on biological diversity by providing global identifiers, labels, and definitions. Darwin Core is primarily based on taxa, their occurrence in nature as documented by observations, specimens, samples, and related information. Fig. <xref ref-type="fig" rid="F5930412">6</xref> shows how the text content of the specimen shown in Fig. <xref ref-type="fig" rid="F5930408">4</xref> could be structured using Darwin Core standard, version 2014 (<xref ref-type="bibr" rid="B5929829">Darwin Core Maintenance Group, Biodiversity Information Standards (TDWG) 2014</xref>; <xref ref-type="bibr" rid="B5929813">Biodiversity Information Standards (TDWG) 2020</xref>). Once structured, the data can be stored in a database allowing for complex queries and efficient retrieval. For example, the geographic coordinates can be used to retrieve data referring to specimens collected within a given radius, which may be further restricted by a time period, institution, species, etc.</p>
        <p>The problem of populating a predefined template such as the one defined by Darwin Core with information found in free text is an area of NLP known as Information Extraction (IE) (<xref ref-type="bibr" rid="B5929863">Doleschal et al. 2020</xref>). The complexity of the template usually requires a bespoke IE system to be developed, which is beyond the scope of this feasibility study. Therefore, we will be focusing on information that could be extracted using NER, a subtask of IE, which can be supported using off-the-shelf software. Here, we focus on two commonly used named entities, namely location and person names. A specimen's country and collector name are the two most useful OCR output fields for triaging specimens before downstream manual transcription (<xref ref-type="bibr" rid="B5929872">Drinkwater et al. 2014</xref>).</p>
      </sec>
    </sec>
    <sec sec-type="3. Digitisation Experiments">
      <title>3. Digitisation Experiments</title>
      <p>This section describes a selection of software tools that can be used to automate the steps of the digitisation workflow shown in Fig. <xref ref-type="fig" rid="F5930404">2</xref> together with the test results obtained using the data described in <ext-link ext-link-type="uri" xlink:href="#2_data">section 2</ext-link>.</p>
      <sec sec-type="3.1 Optical Character Recognition">
        <title>3.1 Optical Character Recognition</title>
        <p>OCR is a technology that allows the automatic recognition of characters through an optical mechanism or computer software (<xref ref-type="bibr" rid="B5930139">Mori et al. 1999</xref>). OCR can be used to convert image-borne characters to text documents that are machine readable in the sense that the text can then be indexed, searched, edited, or processed by NLP software.</p>
        <p>We tested three off-the-shelf OCR software tools, described in Table <xref ref-type="table" rid="T5930430">3</xref>. Tesseract is reportedly the most accurate open-source OCR software with respect to the task of extracting text from specimen labels (<xref ref-type="bibr" rid="B5930021">Haston et al. 2015</xref>). Its development is sponsored by Google (<xref ref-type="bibr" rid="B5929956">Google Open Source 2018</xref>) and it has the native ability to recognise more than 100 languages. We originally considered version 3.0.51 of Tesseract, but later extended our experiments to version 4.0.0, which was released in the meantime and was reported to offer significantly higher accuracy than its earlier version (<xref ref-type="bibr" rid="B5930220">Ooms 2018</xref>). The software development kit ABBYY FineReader Engine 12.0 allows software developers to integrate OCR functionality into their applications to extract textual information from paper documents, images, or displays (<xref ref-type="bibr" rid="B5929764">ABBYY 2018</xref>).</p>
        <p>Microsoft's OneNote is a note-taking and management application for collecting, organising, and sharing digital information (<xref ref-type="bibr" rid="B5930131">Microsoft Corporation 2018</xref>). It contains native OCR functionality whose performance had not been evaluated in another recent investigation into automating data capture from natural history specimens (<xref ref-type="bibr" rid="B5930021">Haston et al. 2015</xref>). Unlike Tesseract and ABBYY FineReader Engine, OneNote is a stand-alone software application whose OCR functionality cannot readily be integrated into other software.</p>
        <p>To evaluate the OCR performance of the aforementioned software tools, we ran two sets of experiments, one against the whole digital images of specimens and the other against the segmented images with an expectation that the latter would result in shorter processing time and higher accuracy. Indeed, the results shown in Table <xref ref-type="table" rid="T5930431">4</xref> demonstrate that the processing time was reduced by 49% on average when images were segmented prior to undergoing OCR. Out of the three batch processing software tools considered, Tesseract 3.0.51 was the fastest in both scenarios. All experiments were performed using the following configuration: a desktop computer containing an Intel i5-4590T 2.00GHz 4 Core CPU (Central Processing Unit), 8.00 GB RAM (Gigabytes of Random Access Memory) and Microsoft Windows 10 Education Version 10.0.17134.</p>
        <p>The accuracy of OCR will be measured in terms of line correctness as described by <xref ref-type="bibr" rid="B5930021">Haston et al. 2015</xref>. To create a gold standard, the text from a digital image is manually transcribed verbatim and the number of original lines counted. The lines from the OCR output are then compared against the gold standard and classified into one of three classes: correct, partially (in)correct and incorrect and scored 1, 0.5, and 0, respectively. An example can be seen in Fig. <xref ref-type="fig" rid="F5930414">7</xref>. The line scores are then aggregated into overall accuracy. This method considers only printed text and not handwritten text.</p>
        <p>Bearing in mind the time and effort involved in creating the gold standard, only a subset of the dataset (250 specimen images and their segments) available for testing was used to evaluate the correctness of the OCR. Five herbarium sheet images, their segments and manual transcriptions, and OCR text used in these experiments can be found in Section 2 of Suppl. material <xref ref-type="supplementary-material" rid="S5930392">1</xref>. A summary of results is given in Table <xref ref-type="table" rid="T5930432">5</xref>.</p>
        <p>Apart from ABBYY FineReader Engine all other tools recorded an accuracy around 70%, with Tesseract 4.0.0 proving to be the most robust with respect to image segmentation. Its performance could be improved by further experiments focusing on its configuration parameters.</p>
      </sec>
      <sec sec-type="3.2 Handwritten Text Recognition">
        <title>3.2 Handwritten Text Recognition</title>
        <p>As mentioned in <ext-link ext-link-type="uri" xlink:href="#11_background">section 1.1</ext-link>, not all specimen labels bear printed text. A huge volume of specimen labels bear handwritten text in place of or in addition to printed text. Similar to using OCR to automatically read printed specimen labels, we can use Handwritten Text Recognition (HTR) to automatically read handwritten specimen labels. HTR is described as the task of transcribing handwritten text into digital text (<xref ref-type="bibr" rid="B5930257">Scheidl 2018</xref>).</p>
        <p>ABBYY FineReader Engine 12.0 and Google Cloud Vision OCR v1 (<xref ref-type="bibr" rid="B5929948">Google Cloud 2018</xref>) are both capable of performing HTR. Google Cloud Vision currently supports 56 languages. Its language settings can be adjusted to improve speed and accuracy of the text recognition. It is a paid service and has a limit of 20MB and 20M pixels per image submitted to it for processing.</p>
        <p>We performed an experiment to measure the HTR performance of both ABBYY FineReader Engine and Google Cloud Vision with respect to handwritten specimen labels. The five specimen whole images used in <ext-link ext-link-type="uri" xlink:href="#31_optical_character_recognition">section 3.1</ext-link> were reused in this experiment. These whole images, each of which bear handwritten text, were submitted to ABBYY FineReader Engine and Google Cloud Vision to undergo HTR.</p>
        <p>The HTR results from ABBYY FineReader Engine and Google Cloud Vision were compared against the gold standard for each specimen image using Levenshtein distance (<xref ref-type="bibr" rid="B5930083">Levenshtein 1966</xref>). The Levenshtein distance measures the minimum difference between two strings by counting the number of insertions, deletions, and substitutions needed to change one string into the other. Note that this metric is not case sensitive. Every field from the test data set was compared to the text obtained through OCR.</p>
        <p>One must be cautious when comparing interpreted gold standard data. For example, where the catalog number is "BM000521570" Google Cloud Vision finds "000521570 (BM)". Technically, Google Cloud Vision has found the correct string, but because the gold standard contains an interpreted value it appears that Google Cloud Vision is not correct. Another example concerns the fact that the gold standard contains fields that use abbreviations, such as country codes. This means that "Australia" and its country code "AU" will not be considered identical, even though they denote the same country.</p>
        <p>Specific fields were identified for HTR analysis: catalogNumber, genus, specificEpithet, country, recordedBy, typeStatus, verbatimLocality, verbatimRecordedBy. Verbatim coordinates are likely too complex or too often open to interpretation to be compared reliably in this analysis. Similarly, verbatimEventDate was ignored because it is not technically verbatim; it may be written “3/8/59” on a specimen label, but recorded as “1959-08-03” in a specimen database (<xref ref-type="bibr" rid="B5929940">Finnish Biodiversity Info Facility 2018</xref>). Year was therefore used instead, although we acknowledge that this is not as precise or as informative as a complete date. We acknowledged this limitation in our analysis: when comparing Years, we required the Levenshtein distance to be 0 (identical strings) for them to be deemed a match. All Levenshtein distances between two Years that were greater than 0 (meaning not identical) were therefore omitted from further analysis.</p>
        <p>Note that typeStatus is not always present in a specimen image. It is therefore often inferred based on other data that is present. It was nevertheless included in the analysis because of its importance in biodiversity taxonomy.</p>
        <p>Fig. <xref ref-type="fig" rid="F5930416">8</xref> shows the count of Levenshtein distance scores for all selected fields combined, Lev<sub>year</sub>&gt;0 excluded. Google Cloud Vision scores better. The high count of results with a distance greater than 4 (indicating large dissimilarity) is partly due to certain fields being interpreted. Such fields might include typeStatus.</p>
        <p>Examining the results in Fig. <xref ref-type="fig" rid="F5930416">8</xref> shows that the Google Cloud Vision scores are higher for the three best distances. Comparing the results in Fig. <xref ref-type="fig" rid="F5930418">9</xref> and Fig. <xref ref-type="fig" rid="F5930420">10</xref> shows that Google Cloud Vision has more results in the best category for each field, while ABBYY FineReader Engine has a higher count of Lev≥4 for each field. Distances greater than 4 can be considered low quality results. When Lev≥4 and Lev<sub>year</sub>&gt;0 results are excluded, Google Cloud Vision obtained 1133 results while ABBYY FineReader Engine obtained 809. When the results are weighted for accuracy (5 for distance=0, 1 for distance≥4, Lev<sub>year</sub>&gt;0 excluded) Google Cloud Vision scored 6540 while ABBYY FineReader Engine scored 4689.</p>
        <p>In conclusion, this comparative test indicates that the results from Google Cloud Vision are of higher quality than ABBYY FineReader Engine. The results are of even higher quality when the lowest scoring categories are excluded. These results demonstrate that HTR can be used to retrieve a considerable volume of data of high quality. HTR should no longer be dismissed as ineffective because it has already become a viable technique.</p>
      </sec>
      <sec sec-type="3.3 Language Identification">
        <title>3.3 Language Identification</title>
        <p>Language identification is the task of determining the natural language that a document is written in. It is a key step in automatic processing of real-world data where a multitude of languages exist (<xref ref-type="bibr" rid="B5930117">Lui and Baldwin 2012</xref>). Languages used on specimen labels can vary across a collection as can be seen in Fig. <xref ref-type="fig" rid="F5930422">11</xref>. In the context of digitisation workflows knowing the languages that specimen labels are written in allows us to inform the subsequent steps, including NLP. It also offers an opportunity to improve manual curation of the results by being able to forward them to people with the required language skills.</p>
        <p>A number of off-the-shelf software tools can be used to perform language identification, examples of which can be seen in Table <xref ref-type="table" rid="T5930433">6</xref>. The given tools can all be integrated into larger software applications.</p>
        <p>Table <xref ref-type="table" rid="T5930434">7</xref> provides output obtained by langid.py from a sample of our test data. The automatically identified language is quantified with a probability estimate. The given library is able to identify 97 different languages without requiring any special configuration. It generally outperforms langdetect (<xref ref-type="bibr" rid="B5929821">Danilák 2018</xref>) in terms of accuracy. In addition, langid.py is reportedly the faster of the two (<xref ref-type="bibr" rid="B5930117">Lui and Baldwin 2012</xref>). The corpus used in the evaluation contained government documents, online encyclopaedia entries, and software documentation (<xref ref-type="bibr" rid="B5930117">Lui and Baldwin 2012</xref>; <xref ref-type="bibr" rid="B5929783">Baldwin and Lui 2010</xref>).</p>
        <p>The program language-detection (<xref ref-type="bibr" rid="B5930265">Shuyo 2014</xref>) provides a third option for language detection. Unlike langid.py and langdetect no evaluation of its performance appears to have been published. It advertises 99% precision over 53 languages although texts of 10 to 20 words are recommended to support accurate detection. This may prove problematic when used with short fragments of OCR text obtained from specimen images.</p>
      </sec>
      <sec sec-type="3.4 Named Entity Recognition">
        <title>3.4 Named Entity Recognition</title>
        <p>NER is commonly used in information extraction to identify text segments that refer to entities from predefined categories (<xref ref-type="bibr" rid="B5930147">Nadeau and Sekine 2009</xref>). The state-of-the-art approach is to use conditional random fields trained on data manually labelled with these categories to learn automatically how to extract named entities from text. Traditionally, these categories include persons, organisations, and locations. Therefore, pre-trained models for these categories are readily available. For instance, Stanford NER (<xref ref-type="bibr" rid="B5930318">The Stanford Natural Language Processing Group 2018</xref>) provides such models.</p>
        <p>As mentioned in <ext-link ext-link-type="uri" xlink:href="#23_metadata">section 2.3</ext-link>, in this study we are interested in two categories of named entity: country (part of the location) and collector (a specific person). Pre-trained NER software can only identify names of locations and persons, but cannot verify that a location is a country or that a person is a collector. Therefore, we will generalise our NER problem into that of recognising persons and locations in general and will accordingly measure the performance of Stanford NER on our dataset. A subset of specimen labels were manually transcribed and annotated with person and location labels to create a gold standard against which to evaluate Stanford NER. Fig. <xref ref-type="fig" rid="F5930424">12</xref> shows a specimen label. Fig. <xref ref-type="fig" rid="F5930426">13</xref> shows the results of both manual transcription and NER with respect to that specimen label.</p>
        <p>According to <xref ref-type="bibr" rid="B5930074">Jiang et al. 2016</xref> a named entity is recognised correctly if either of the following criteria is met:</p>
        <p><list list-type="order">
          <list-item>
            <p>Both boundaries of a named entity and its type match. For example, the segment “Ilkka Kukkonen” in Fig. <xref ref-type="fig" rid="F5930426">13</xref> is recognised fully and correctly as a person.</p>
          </list-item>
          <list-item>
            <p>Two text segments overlap partially and match on the type.</p>
          </list-item>
        </list></p>
        <p>Either way, the NER results are usually evaluated using the three most commonly used measures in NLP: precision, recall, and F1 score. In the context of NER, precision is the fraction of automatically recognised entities that are correct, whereas recall is the fraction of manually annotated named entities that were successfully recognised by the NER system. F1 score is a measure that combines precision and recall - it is the harmonic mean of the two.</p>
        <p>Table <xref ref-type="table" rid="T5930435">8</xref> and the formulae below show how these might be calculated. An example follows that explains the terms used.</p>
        <p><bold>Formulae for Precision, Recall, and F1 Score</bold>:</p>
        <p>
          <tex-math id="M1"><![CDATA[\documentclass[12pt]{standalone}
\usepackage{varwidth}

\usepackage[utf8x]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{lmodern}

\usepackage{amsmath, amssymb, graphics, setspace}
\newcommand{\mathsym}[1]{{}}
\newcommand{\unicode}[1]{{}}
\newcounter{mathematicapage}
\begin{document}
   \begin{varwidth}{50in}
        \begin{equation*}
            Precision = {\text{True Positive} \over \text{True Positive} + \text{False Positive}}
        \end{equation*}
    \end{varwidth}
\end{document}
]]></tex-math>
        </p>
        <p>
          <tex-math id="M2"><![CDATA[\documentclass[12pt]{standalone}
\usepackage{varwidth}

\usepackage[utf8x]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{lmodern}

\usepackage{amsmath, amssymb, graphics, setspace}
\newcommand{\mathsym}[1]{{}}
\newcommand{\unicode}[1]{{}}
\newcounter{mathematicapage}
\begin{document}
   \begin{varwidth}{50in}
        \begin{equation*}
            Recall = {\text{True Positive} \over \text{True Positive} + \text{False Negative}}
        \end{equation*}
    \end{varwidth}
\end{document}
]]></tex-math>
        </p>
        <p>
          <tex-math id="M3"><![CDATA[\documentclass[12pt]{standalone}
\usepackage{varwidth}

\usepackage[utf8x]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{lmodern}

\usepackage{amsmath, amssymb, graphics, setspace}
\newcommand{\mathsym}[1]{{}}
\newcommand{\unicode}[1]{{}}
\newcounter{mathematicapage}
\begin{document}
   \begin{varwidth}{50in}
        \begin{equation*}
            F1Score= 2*{\text{Precision}*\text{Recall} \over \text{Precision} + \text{Recall}}
        \end{equation*}
    \end{varwidth}
\end{document}
]]></tex-math>
        </p>
        <p>To evaluate the performance of NER on our dataset, we selected a subset of five herbarium sheet images and their segments, which are to be found in Section 3 of Suppl. material <xref ref-type="supplementary-material" rid="S5930392">1</xref>. These are the same images and segments used to calculate line correctness in <ext-link ext-link-type="uri" xlink:href="#31_optical_character_recognition">section 3.1</ext-link>. The OCR output used is that obtained using Tesseract 4.0.0.</p>
        <p>Table <xref ref-type="table" rid="T5930436">9</xref> and Table <xref ref-type="table" rid="T5930437">10</xref> show the results of Stanford NER performance.</p>
        <p>An improvement across all measures can be observed when using OCR text from segmented images. This is consistent with the increased line correctness described in <ext-link ext-link-type="uri" xlink:href="#31_optical_character_recognition">section 3.1</ext-link>.</p>
      </sec>
      <sec sec-type="3.5 Terminology Extraction">
        <title>3.5 Terminology Extraction</title>
        <p>To improve the accessibility of a specimen collection, its content needs to be not only digitised but also organised in alphabetical or some other systematic order. This is naturally expected to be done by species name. The problem with old specimens is that the content of their labels is not likely to comply with today's standards. Therefore, matching them against existing taxonomies will fail to recognise non-standard terminology. To automatically extract species names together with other relevant terminology, we propose an unsupervised data-driven approach to terminology extraction. FlexiTerm is a method developed in-house at Cardiff University. It has been designed to automatically extract multi-word terms from a domain-specific corpus of text documents (<xref ref-type="bibr" rid="B5930273">Spasić et al. 2013</xref>; <xref ref-type="bibr" rid="B5930283">Spasić 2018</xref>).</p>
        <p>OCR text extracted from specimens in a given herbarium fits a description of a domain-specific corpus; therefore FlexiTerm can exploit linguistic and statistical patterns of language use within a specific herbarium to automatically extract relevant terminology. Section 4 of Suppl. material <xref ref-type="supplementary-material" rid="S5930392">1</xref> shows the multi-word terms extracted from the text recognised using Tesseract 4.0.0 on the segmented images. The results show that the majority of extracted terminology refers to organisations (herbaria) that host the specimens, such as “Royal Botanic Gardens Edinburgh” or “Nationaal Herbarium Nederland”. There are also mentions of collectors, such as “Ilkka Kukkonen” that were also recognised as persons by NER. In that respect, there is some overlap between NER and terminology extraction. Regardless of their type, the multi-word terms extracted by FlexiTerm will represent the longest repetitive phrases found in a collection. Therefore, their recognition can facilitate transcription or curation of a digital collection should these activities be crowdsourced.</p>
      </sec>
    </sec>
    <sec sec-type="4. Putting It All Together">
      <title>4. Putting It All Together</title>
      <p>Many scientific disciplines are increasingly data driven and new scientific knowledge is often gained by scientists putting together data analysis and knowledge discovery “pipelines” (<xref ref-type="bibr" rid="B5930103">Ludäscher et al. 2006</xref>). These “pipelines” are known as scientific workflows. Interpreting data and attaching meaning to it creates information. Interpreting information in the context of prior knowledge, experience and wisdom can lead to new knowledge.</p>
      <p>A scientific workflow consists of a series of analytical steps. These can involve data discovery and access, data analysis, modelling and simulation, and data mining. Steps can be computationally intensive and therefore are often carried out on high‐performance computing clusters. Herbadrop, a pilot study of specimen digitisation using OCR, demonstrated successful use of high performance digital workflows (<xref ref-type="bibr" rid="B5929932">EUDAT 2017</xref>). In this section, we review workflow management systems that can be used to automate the workflow presented in Fig. <xref ref-type="fig" rid="F5930404">2</xref>.</p>
      <p>The tools that allow scientists to compose and execute scientific workflows are generally known as workflow management systems, of which <ext-link ext-link-type="uri" xlink:href="https://taverna.incubator.apache.org/">Apache Taverna</ext-link> and <ext-link ext-link-type="uri" xlink:href="https://kepler-project.org/">Kepler</ext-link> are among the most well-known and best established examples.</p>
      <p>Apache Taverna is open-source and domain-independent (<xref ref-type="bibr" rid="B5930301">The Apache Software Foundation 2018</xref>). It is designed for use in any scientific discipline and is supported by a large community of users.</p>
      <p>Taverna was successfully deployed within the domain of biodiversity via BioVeL - a virtual laboratory for data analysis and modelling in biodiversity (<xref ref-type="bibr" rid="B5929964">Hardisty et al. 2016</xref>). BioVeL allowed the building of workflows through the selection of a series of data processing services and could process large volumes of data even when the services needed to do that are distributed among multiple service providers.</p>
      <p>Taverna supported BioVeL users by allowing them to create workflows via a visual interface as opposed to writing code. Users were presented with a selection of processing steps and can “drag and drop” them to create a workflow. They could then test the workflow by running it on their desktop machine before deploying it to more powerful computing resources.</p>
      <p>Kepler is a scientific workflow application also designed for creating, executing and sharing analyses across a broad range of scientific disciplines (<xref ref-type="bibr" rid="B5929772">Altintas et al. 2004</xref>). Application areas include bioinformatics, particle physics and ecology.</p>
      <p>Like Taverna, Kepler provides a graphical user interface to aid in the selection of analytical components to form scientific workflows (<xref ref-type="bibr" rid="B5929797">Barseghian et al. 2010</xref>). It also offers data provenance features that allow users to examine workflow output in detail for diagnostic purposes (<xref ref-type="bibr" rid="B5930092">Liew et al. 2016</xref>). This supports the reliability and reproducibility of evidence from data, which is necessary for the presentation of conclusions in research publications.</p>
      <p>Tools like Apache Taverna and Kepler can be used for creating workflows for OCR, NER, and IE, like that depicted in Fig. <xref ref-type="fig" rid="F5930404">2</xref>. When managed and executed in virtual research environments such as BioVeL, the data and results can be collated, managed, and shared appropriately. Such workflows can be run repeatedly, reliably, and efficiently with the possibility to process many tens of thousands of label images in parallel within a single workflow run.</p>
    </sec>
    <sec sec-type="5. Conclusions">
      <title>5. Conclusions</title>
      <p>We designed a modular approach for automated text digitisation with respect to specimen labels (Fig. <xref ref-type="fig" rid="F5930393">1</xref>). To minimise implementation overhead, we proposed implementing this approach as a scientific workflow using off-the-shelf software to support individual components. An additional advantage of this approach is an opportunity to run the workflow in a distributed environment, thus supporting large-scale digitisation as well as an optimal use of resources across multiple institutions. Based on the local experience and expertise associated with both development and applications, we recommend the use of Apache Taverna for implementing and executing the workflow. We evaluated off-the-shelf software that can support specific modules within the workflow. Our recommendations are summarised in Table <xref ref-type="table" rid="T5930438">11</xref>. Further research is needed with respect to image segmentation, which has been shown to have significant effect on the performance across all tasks listed in Table <xref ref-type="table" rid="T5930438">11</xref>.</p>
    </sec>
    <sec sec-type="6. Appendices">
      <title>6. Appendices</title>
      <p>For the sake of brevity the appendices can be found in the supplementary document "<ext-link ext-link-type="uri" xlink:href="#supplementary_files">Appendices</ext-link>". The document contains the following principal information concerning the Digitisation Experiments:</p>
      <p><list list-type="bullet">
        <list-item>
          <p>OCR Software Settings</p>
        </list-item>
        <list-item>
          <p>OCR Line Correctness Analysis Data</p>
        </list-item>
        <list-item>
          <p>NER Analysis Data</p>
        </list-item>
        <list-item>
          <p>Non-standard Terminology Extraction Analysis Data</p>
        </list-item>
      </list></p>
    </sec>
    <sec sec-type="7. Glossary">
      <title>7. Glossary</title>
      <p><list list-type="bullet">
        <list-item>
            <p><bold>Automated text digitisation</bold> - The process of converting written text to a machine-readable format that allows text to become searchable. In biodiversity, documents can typically include printed or handwritten specimen labels.</p>
        </list-item>
        <list-item>
          <p><bold>Conditional Random Field</bold> - A machine learning method for structural pattern recognition; in particular, sequence labelling. For example, an unnamed image containing part of a leaf can appear in a sequence of plant specimen images. A machine may be able to determine that the leaf belongs to a "deciduous holly" if a named image of that plant neighbours the leaf image in the sequence.</p>
        </list-item>
        <list-item>
          <p><bold>Deep learning</bold> - A type of machine learning based on neural networks. It is widely used in both image processing and natural language processing to support end-to-end learning by simultaneously training all parameters and representing them by a single model. This makes manual feature engineering redundant.</p>
        </list-item>
        <list-item>
          <p><bold>Gold standard</bold> - A dataset used to evaluate a computational model. The gold standard is often produced by manual data annotation. In the task of automated text digitisation of a specimen label a human transcribes the label. This forms a reference against which the model to digitise the labels automatically can be tested.</p>
        </list-item>
        <list-item>
          <p><bold>Handwritten Text Recognition (HTR)</bold> - Automated digitisation of hand-written text.</p>
        </list-item>
        <list-item>
          <p><bold>High performance computing cluster</bold> - This approach to computing involves multiple co-located computer processors working alongside one another in parallel to complete a task.</p>
        </list-item>
        <list-item>
          <p><bold>Information Extraction (IE)</bold> - The task of extracting information from unstructured text into a predefined template. For example, information contained in a specimen label can be extracted and structured into a Darwin Core record.</p>
        </list-item>
        <list-item>
          <p><bold>JPEG</bold> - A compressed format for computer image files, designed to make them easy to store and to send between computers.</p>
        </list-item>
        <list-item>
            <p><bold>Language identification</bold> - The task of automatically classifying the natural language a document is written in, e.g., English, Spanish, etc.</p>
        </list-item>
        <list-item>
          <p><bold>Machine learning</bold> - The process of generalising available data into a computational model that can then be used to make inferences on unseen data. For example, a computer may have learnt that leaves of the holly species of plant contain several pointed ends if it has observed many such images in the past. If the computer later sees an image of a rounded leaf it may determine that the leaf is unlikely to be the holly species.</p>
        </list-item>
        <list-item>
          <p><bold>Metadata</bold> - Typically described as data about data. Metadata consist of structured information that describes, explains, locates or otherwise makes it easier to find, access and use the underlying data. A digital photograph of a plant specimen is data. This photograph may be accompanied by additional information such as the date and time the photograph was taken, the name of the camera used, and the resolution of the image. This is metadata.</p>
        </list-item>
        <list-item>
          <p><bold>Named Entity Recognition (NER)</bold> - A subtask of information extraction focusing on named entities, such as persons, countries, cities and organisations.</p>
        </list-item>
        <list-item>
          <p><bold>Natural Language Processing (NLP)</bold> - A wide range of tasks and methods used to automatically analyse information expressed in a natural language.</p>
        </list-item>
        <list-item>
            <p><bold>Optical Character Recognition (OCR)</bold> - The process of converting images of text, such as a photograph of a specimen label, into a machine-readable format.</p>
        </list-item>
        <list-item>
          <p><bold>Scientific workflow</bold> - The description of a process in terms of a sequence of steps (tasks and sub-tasks) that must be completed, generally with computer assistance to meet some research goal. A workflow might include the digitisation, acquisition, and curation of specimen label data using a sequence of steps that involves OCR and NLP methods.</p>
        </list-item>
      </list></p>
    </sec>
  </body>
  <back>
    <sec sec-type="Funding program">
      <title>Funding program</title>
      <p>
        <ext-link ext-link-type="uri" xlink:href="https://cordis.europa.eu/programme/id/H2020-EU.1.4.1.1.">H2020-EU.1.4.1.1. - Developing new world-class research infrastructures</ext-link>
      </p>
    </sec>
    <sec sec-type="Grant title">
      <title>Grant title</title>
      <p><ext-link ext-link-type="uri" xlink:href="https://cordis.europa.eu/project/id/777483?rcn=4256">ICEDIG</ext-link> – “Innovation and consolidation for large scale digitisation of natural heritage”, Grant Agreement No. 777483</p>
    </sec>
    <sec sec-type="Author contributions">
      <title>Author contributions</title>
      <p>
        <bold>Authors</bold>
      </p>
      <p><bold>David Owen</bold>: Data Curation, Formal Analysis, Methodology, Software, Writing - Original Draft. <bold>Quentin Groom</bold>: Funding acquisition, Resources, Writing - Original Draft, Supervision. <bold>Alex Hardisty</bold>: Funding acquisition, Supervision, Writing - Original Draft. <bold>Thijs Leegwater</bold>: Formal analysis, Methodology. <bold>Laurence Livermore</bold>: Validation, Writing - review and editing. <bold>Myriam van Walsum</bold>: Formal analysis, Methodology, Writing - Original Draft. <bold>Noortje Wijkamp</bold>: Formal analysis, Methodology. <bold>Irena Spasić</bold>: Conceptualisation, Methodology, Funding acquisition, Supervision, Writing - Original Draft.</p>
      <p>
        <bold>Contributors</bold>
      </p>
      <p><bold>Mathias Dillen</bold>: Resources, Visualisation. <bold>Sarah Phillips</bold>: Methodology, Resources. <bold>Zhengzhe Wu</bold>: Resources.</p>
      <p>Contribution types are drawn from CRediT - <ext-link ext-link-type="uri" xlink:href="https://casrai.org/credit/">Contributor Roles Taxonomy</ext-link>.</p>
    </sec>
    <ref-list>
      <title>References</title>
      <ref id="B5929764">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>ABBYY</surname>
            </name>
          </person-group>
          <article-title>AI-powered OCR SDK for Windows, Linux &amp; Mac OS | ABBYY OCR API</article-title>
          <uri>https://www.abbyy.com/en-gb/ocr-sdk</uri>
          <date-in-citation content-type="access-date">2018-11-21T00:00:00+02:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5929772">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Altintas</surname>
              <given-names>I.</given-names>
            </name>
            <name name-style="western">
              <surname>Berkley</surname>
              <given-names>C.</given-names>
            </name>
            <name name-style="western">
              <surname>Jaeger</surname>
              <given-names>E.</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>M.</given-names>
            </name>
            <name name-style="western">
              <surname>Ludascher</surname>
              <given-names>B.</given-names>
            </name>
            <name name-style="western">
              <surname>Mock</surname>
              <given-names>S.</given-names>
            </name>
          </person-group>
          <year>2004</year>
          <article-title>Kepler: an extensible system for design and execution of scientific workflows</article-title>
          <source>Proceedings. 16th International Conference on Scientific and Statistical Database Management, 2004.</source>
          <fpage>423</fpage>
          <lpage>424</lpage>
          <pub-id pub-id-type="doi">10.1109/ssdm.2004.1311241</pub-id>
        </element-citation>
      </ref>
      <ref id="B5929783">
        <element-citation publication-type="conference-paper">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Baldwin</surname>
              <given-names>Timothy</given-names>
            </name>
            <name name-style="western">
              <surname>Lui</surname>
              <given-names>Marco</given-names>
            </name>
          </person-group>
          <year>2010</year>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Linguistics</surname>
              <given-names>Association for Computational</given-names>
            </name>
          </person-group>
          <article-title>
            <italic>Language identification: the long and the short of the matter</italic>
          </article-title>
          <source>
            <italic>Human Language Technologies: The 2010 Annual Conference of the North American Chapter of the Association for Computational Linguistics</italic>
          </source>
          <conf-loc>Los Angeles, California</conf-loc>
          <uri>https://www.aclweb.org/anthology/N10-1027</uri>
        </element-citation>
      </ref>
      <ref id="B5929797">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Barseghian</surname>
              <given-names>Derik</given-names>
            </name>
            <name name-style="western">
              <surname>Altintas</surname>
              <given-names>Ilkay</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>Matthew B.</given-names>
            </name>
            <name name-style="western">
              <surname>Crawl</surname>
              <given-names>Daniel</given-names>
            </name>
            <name name-style="western">
              <surname>Potter</surname>
              <given-names>Nathan</given-names>
            </name>
            <name name-style="western">
              <surname>Gallagher</surname>
              <given-names>James</given-names>
            </name>
            <name name-style="western">
              <surname>Cornillon</surname>
              <given-names>Peter</given-names>
            </name>
            <name name-style="western">
              <surname>Schildhauer</surname>
              <given-names>Mark</given-names>
            </name>
            <name name-style="western">
              <surname>Borer</surname>
              <given-names>Elizabeth T.</given-names>
            </name>
            <name name-style="western">
              <surname>Seabloom</surname>
              <given-names>Eric W.</given-names>
            </name>
            <name name-style="western">
              <surname>Hosseini</surname>
              <given-names>Parviez R.</given-names>
            </name>
          </person-group>
          <year>2010</year>
          <article-title>Workflows and extensions to the Kepler scientific workflow system to support environmental sensor data access and analysis</article-title>
          <source>Ecological Informatics</source>
          <volume>5</volume>
          <issue>1</issue>
          <fpage>42</fpage>
          <lpage>50</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ecoinf.2009.08.008</pub-id>
        </element-citation>
      </ref>
      <ref id="B5929813">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>(TDWG)</surname>
              <given-names>Biodiversity Information Standards</given-names>
            </name>
          </person-group>
          <article-title>Darwin Core</article-title>
          <uri>https://dwc.tdwg.org/</uri>
          <date-in-citation content-type="access-date">2020-06-05T00:00:00+03:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5929821">
        <element-citation publication-type="software">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Danilák</surname>
              <given-names>M.</given-names>
            </name>
          </person-group>
          <year>2018</year>
          <article-title>
            <italic>langdetect</italic>
          </article-title>
          <publisher-name>GitHub</publisher-name>
          <uri>https://github.com/Mimino666/langdetect</uri>
        </element-citation>
      </ref>
      <ref id="B5929829">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Darwin Core Maintenance Group</surname>
              <given-names>Biodiversity Information Standards (TDWG)</given-names>
            </name>
          </person-group>
          <year>2014</year>
          <article-title>
            <italic>Darwin Core</italic>
          </article-title>
          <source>Zenodo</source>
          <pub-id pub-id-type="doi">10.5281/zenodo.592792</pub-id>
        </element-citation>
      </ref>
      <ref id="B5929838">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dillen</surname>
              <given-names>Mathias</given-names>
            </name>
            <name name-style="western">
              <surname>Groom</surname>
              <given-names>Quentin</given-names>
            </name>
            <name name-style="western">
              <surname>Chagnoux</surname>
              <given-names>Simon</given-names>
            </name>
            <name name-style="western">
              <surname>Güntsch</surname>
              <given-names>Anton</given-names>
            </name>
            <name name-style="western">
              <surname>Hardisty</surname>
              <given-names>Alex</given-names>
            </name>
            <name name-style="western">
              <surname>Haston</surname>
              <given-names>Elspeth</given-names>
            </name>
            <name name-style="western">
              <surname>Livermore</surname>
              <given-names>Laurence</given-names>
            </name>
            <name name-style="western">
              <surname>Runnel</surname>
              <given-names>Veljo</given-names>
            </name>
            <name name-style="western">
              <surname>Schulman</surname>
              <given-names>Leif</given-names>
            </name>
            <name name-style="western">
              <surname>Willemse</surname>
              <given-names>Luc</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Zhengzhe</given-names>
            </name>
            <name name-style="western">
              <surname>Phillips</surname>
              <given-names>Sarah</given-names>
            </name>
          </person-group>
          <year>2019</year>
          <article-title>A benchmark dataset of herbarium specimen images with label data</article-title>
          <source>Biodiversity Data Journal</source>
          <volume>7</volume>
          <pub-id pub-id-type="doi">10.3897/bdj.7.e31817</pub-id>
        </element-citation>
      </ref>
      <ref id="B5929855">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>DiSSCo</surname>
            </name>
          </person-group>
          <article-title>Distributed System of Scientific Collections</article-title>
          <uri>https://www.dissco.eu/</uri>
          <date-in-citation content-type="access-date">2020-05-30T00:00:00+03:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5929863">
        <element-citation publication-type="conference-proceeding">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Doleschal</surname>
              <given-names>Johannes</given-names>
            </name>
            <name name-style="western">
              <surname>Kimelfeld</surname>
              <given-names>Benny</given-names>
            </name>
            <name name-style="western">
              <surname>Martens</surname>
              <given-names>Wim</given-names>
            </name>
            <name name-style="western">
              <surname>Peterfreund</surname>
              <given-names>Liat</given-names>
            </name>
          </person-group>
          <year>2020</year>
          <source>Weight Annotation in Information Extraction</source>
          <volume>155</volume>
          <conf-name>23rd International Conference on Database Theory (ICDT 2020)</conf-name>
          <conf-loc>Copenhagen</conf-loc>
          <conf-date>30th March-2nd April, 2020</conf-date>
          <publisher-name>Schloss Dagstuhl – Leibniz-Zentrum für Informatik</publisher-name>
          <publisher-loc>Dagstuhl</publisher-loc>
          <source>Leibniz International Proceedings in Informatics (LIPIcs)</source>
          <size units="page">18</size>
          <uri>https://drops.dagstuhl.de/opus/volltexte/2020/11932/</uri>
          <isbn>978-3-95977-139-9</isbn>
          <pub-id pub-id-type="doi">10.4230/LIPIcs.ICDT.2020.8</pub-id>
        </element-citation>
      </ref>
      <ref id="B5929872">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Drinkwater</surname>
              <given-names>Robyn</given-names>
            </name>
            <name name-style="western">
              <surname>Cubey</surname>
              <given-names>Robert</given-names>
            </name>
            <name name-style="western">
              <surname>Haston</surname>
              <given-names>Elspeth</given-names>
            </name>
          </person-group>
          <year>2014</year>
          <article-title>The use of Optical Character Recognition (OCR) in the digitisation of herbarium specimen labels</article-title>
          <source>PhytoKeys</source>
          <volume>38</volume>
          <fpage>15</fpage>
          <lpage>30</lpage>
          <pub-id pub-id-type="doi">10.3897/phytokeys.38.7168</pub-id>
        </element-citation>
      </ref>
      <ref id="B5929881">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ellwood</surname>
              <given-names>Elizabeth R</given-names>
            </name>
            <name name-style="western">
              <surname>Kimberly</surname>
              <given-names>Paul</given-names>
            </name>
            <name name-style="western">
              <surname>Guralnick</surname>
              <given-names>Robert</given-names>
            </name>
            <name name-style="western">
              <surname>Flemons</surname>
              <given-names>Paul</given-names>
            </name>
            <name name-style="western">
              <surname>Love</surname>
              <given-names>Kevin</given-names>
            </name>
            <name name-style="western">
              <surname>Ellis</surname>
              <given-names>Shari</given-names>
            </name>
            <name name-style="western">
              <surname>Allen</surname>
              <given-names>Julie M</given-names>
            </name>
            <name name-style="western">
              <surname>Best</surname>
              <given-names>Jason H</given-names>
            </name>
            <name name-style="western">
              <surname>Carter</surname>
              <given-names>Richard</given-names>
            </name>
            <name name-style="western">
              <surname>Chagnoux</surname>
              <given-names>Simon</given-names>
            </name>
            <name name-style="western">
              <surname>Costello</surname>
              <given-names>Robert</given-names>
            </name>
            <name name-style="western">
              <surname>Denslow</surname>
              <given-names>Michael W</given-names>
            </name>
            <name name-style="western">
              <surname>Dunckel</surname>
              <given-names>Betty A</given-names>
            </name>
            <name name-style="western">
              <surname>Ferriter</surname>
              <given-names>Meghan M</given-names>
            </name>
            <name name-style="western">
              <surname>Gilbert</surname>
              <given-names>Edward E</given-names>
            </name>
            <name name-style="western">
              <surname>Goforth</surname>
              <given-names>Christine</given-names>
            </name>
            <name name-style="western">
              <surname>Groom</surname>
              <given-names>Quentin</given-names>
            </name>
            <name name-style="western">
              <surname>Krimmel</surname>
              <given-names>Erica R</given-names>
            </name>
            <name name-style="western">
              <surname>LaFrance</surname>
              <given-names>Raphael</given-names>
            </name>
            <name name-style="western">
              <surname>Martinec</surname>
              <given-names>Joann Lacey</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>Andrew N</given-names>
            </name>
            <name name-style="western">
              <surname>Minnaert-Grote</surname>
              <given-names>Jamie</given-names>
            </name>
            <name name-style="western">
              <surname>Nash</surname>
              <given-names>Thomas</given-names>
            </name>
            <name name-style="western">
              <surname>Oboyski</surname>
              <given-names>Peter</given-names>
            </name>
            <name name-style="western">
              <surname>Paul</surname>
              <given-names>Deborah L</given-names>
            </name>
            <name name-style="western">
              <surname>Pearson</surname>
              <given-names>Katelin D</given-names>
            </name>
            <name name-style="western">
              <surname>Pentcheff</surname>
              <given-names>N Dean</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>Mari A</given-names>
            </name>
            <name name-style="western">
              <surname>Seltzer</surname>
              <given-names>Carrie E</given-names>
            </name>
            <name name-style="western">
              <surname>Soltis</surname>
              <given-names>Pamela S</given-names>
            </name>
            <name name-style="western">
              <surname>Stephens</surname>
              <given-names>Rhiannon</given-names>
            </name>
            <name name-style="western">
              <surname>Sweeney</surname>
              <given-names>Patrick W</given-names>
            </name>
            <name name-style="western">
              <surname>von Konrat</surname>
              <given-names>Matt</given-names>
            </name>
            <name name-style="western">
              <surname>Wall</surname>
              <given-names>Adam</given-names>
            </name>
            <name name-style="western">
              <surname>Wetzer</surname>
              <given-names>Regina</given-names>
            </name>
            <name name-style="western">
              <surname>Zimmerman</surname>
              <given-names>Charles</given-names>
            </name>
            <name name-style="western">
              <surname>Mast</surname>
              <given-names>Austin R</given-names>
            </name>
          </person-group>
          <year>2018</year>
          <article-title>Worldwide Engagement for Digitizing Biocollections (WeDigBio): The Biocollections Community's Citizen-Science Space on the Calendar</article-title>
          <source>BioScience</source>
          <volume>68</volume>
          <issue>2</issue>
          <fpage>112</fpage>
          <lpage>124</lpage>
          <pub-id pub-id-type="doi">10.1093/biosci/bix143</pub-id>
        </element-citation>
      </ref>
      <ref id="B5929923">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Engledow</surname>
              <given-names>Henry</given-names>
            </name>
            <name name-style="western">
              <surname>De Smedt</surname>
              <given-names>Sofie</given-names>
            </name>
            <name name-style="western">
              <surname>Bogaerts</surname>
              <given-names>Ann</given-names>
            </name>
            <name name-style="western">
              <surname>Groom</surname>
              <given-names>Quentin</given-names>
            </name>
          </person-group>
          <year>2018</year>
          <article-title>An Evaluation of In-house versus Out-sourced Data Capture at the Meise Botanic Garden (BR)</article-title>
          <source>Biodiversity Information Science and Standards</source>
          <volume>2</volume>
          <pub-id pub-id-type="doi">10.3897/biss.2.26514</pub-id>
        </element-citation>
      </ref>
      <ref id="B5929932">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>EUDAT</surname>
            </name>
          </person-group>
          <article-title>EUDAT &amp; Herbadrop Collaboration</article-title>
          <uri>https://www.eudat.eu/eudat-herbadrop-collaboration</uri>
          <date-in-citation content-type="access-date">2018-10-08T00:00:00+03:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5929940">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Facility</surname>
              <given-names>Finnish Biodiversity Info</given-names>
            </name>
          </person-group>
          <article-title>
            <italic>Suomen Lajitietokeskus</italic>
          </article-title>
          <uri>http://id.luomus.fi/EIG.6494</uri>
          <date-in-citation content-type="access-date">2018-12-22T00:00:00+02:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5929948">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cloud</surname>
              <given-names>Google</given-names>
            </name>
          </person-group>
          <article-title>Detect Text (OCR)</article-title>
          <uri>https://cloud.google.com/vision/docs/ocr</uri>
          <date-in-citation content-type="access-date">2018-12-22T00:00:00+02:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5929956">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Source</surname>
              <given-names>Google Open</given-names>
            </name>
          </person-group>
          <article-title>
            <italic>Tesseract OCR</italic>
          </article-title>
          <uri>https://opensource.google.com/projects/tesseract</uri>
          <date-in-citation content-type="access-date">2018-10-22T00:00:00+03:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5929964">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hardisty</surname>
              <given-names>Alex R.</given-names>
            </name>
            <name name-style="western">
              <surname>Bacall</surname>
              <given-names>Finn</given-names>
            </name>
            <name name-style="western">
              <surname>Beard</surname>
              <given-names>Niall</given-names>
            </name>
            <name name-style="western">
              <surname>Balcázar-Vargas</surname>
              <given-names>Maria-Paula</given-names>
            </name>
            <name name-style="western">
              <surname>Balech</surname>
              <given-names>Bachir</given-names>
            </name>
            <name name-style="western">
              <surname>Barcza</surname>
              <given-names>Zoltán</given-names>
            </name>
            <name name-style="western">
              <surname>Bourlat</surname>
              <given-names>Sarah J.</given-names>
            </name>
            <name name-style="western">
              <surname>De Giovanni</surname>
              <given-names>Renato</given-names>
            </name>
            <name name-style="western">
              <surname>de Jong</surname>
              <given-names>Yde</given-names>
            </name>
            <name name-style="western">
              <surname>De Leo</surname>
              <given-names>Francesca</given-names>
            </name>
            <name name-style="western">
              <surname>Dobor</surname>
              <given-names>Laura</given-names>
            </name>
            <name name-style="western">
              <surname>Donvito</surname>
              <given-names>Giacinto</given-names>
            </name>
            <name name-style="western">
              <surname>Fellows</surname>
              <given-names>Donal</given-names>
            </name>
            <name name-style="western">
              <surname>Guerra</surname>
              <given-names>Antonio Fernandez</given-names>
            </name>
            <name name-style="western">
              <surname>Ferreira</surname>
              <given-names>Nuno</given-names>
            </name>
            <name name-style="western">
              <surname>Fetyukova</surname>
              <given-names>Yuliya</given-names>
            </name>
            <name name-style="western">
              <surname>Fosso</surname>
              <given-names>Bruno</given-names>
            </name>
            <name name-style="western">
              <surname>Giddy</surname>
              <given-names>Jonathan</given-names>
            </name>
            <name name-style="western">
              <surname>Goble</surname>
              <given-names>Carole</given-names>
            </name>
            <name name-style="western">
              <surname>Güntsch</surname>
              <given-names>Anton</given-names>
            </name>
            <name name-style="western">
              <surname>Haines</surname>
              <given-names>Robert</given-names>
            </name>
            <name name-style="western">
              <surname>Ernst</surname>
              <given-names>Vera Hernández</given-names>
            </name>
            <name name-style="western">
              <surname>Hettling</surname>
              <given-names>Hannes</given-names>
            </name>
            <name name-style="western">
              <surname>Hidy</surname>
              <given-names>Dóra</given-names>
            </name>
            <name name-style="western">
              <surname>Horváth</surname>
              <given-names>Ferenc</given-names>
            </name>
            <name name-style="western">
              <surname>Ittzés</surname>
              <given-names>Dóra</given-names>
            </name>
            <name name-style="western">
              <surname>Ittzés</surname>
              <given-names>Péter</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>Andrew</given-names>
            </name>
            <name name-style="western">
              <surname>Kottmann</surname>
              <given-names>Renzo</given-names>
            </name>
            <name name-style="western">
              <surname>Kulawik</surname>
              <given-names>Robert</given-names>
            </name>
            <name name-style="western">
              <surname>Leidenberger</surname>
              <given-names>Sonja</given-names>
            </name>
            <name name-style="western">
              <surname>Lyytikäinen-Saarenmaa</surname>
              <given-names>Päivi</given-names>
            </name>
            <name name-style="western">
              <surname>Mathew</surname>
              <given-names>Cherian</given-names>
            </name>
            <name name-style="western">
              <surname>Morrison</surname>
              <given-names>Norman</given-names>
            </name>
            <name name-style="western">
              <surname>Nenadic</surname>
              <given-names>Aleksandra</given-names>
            </name>
            <name name-style="western">
              <surname>de la Hidalga</surname>
              <given-names>Abraham Nieva</given-names>
            </name>
            <name name-style="western">
              <surname>Obst</surname>
              <given-names>Matthias</given-names>
            </name>
            <name name-style="western">
              <surname>Oostermeijer</surname>
              <given-names>Gerard</given-names>
            </name>
            <name name-style="western">
              <surname>Paymal</surname>
              <given-names>Elisabeth</given-names>
            </name>
            <name name-style="western">
              <surname>Pesole</surname>
              <given-names>Graziano</given-names>
            </name>
            <name name-style="western">
              <surname>Pinto</surname>
              <given-names>Salvatore</given-names>
            </name>
            <name name-style="western">
              <surname>Poigné</surname>
              <given-names>Axel</given-names>
            </name>
            <name name-style="western">
              <surname>Fernandez</surname>
              <given-names>Francisco Quevedo</given-names>
            </name>
            <name name-style="western">
              <surname>Santamaria</surname>
              <given-names>Monica</given-names>
            </name>
            <name name-style="western">
              <surname>Saarenmaa</surname>
              <given-names>Hannu</given-names>
            </name>
            <name name-style="western">
              <surname>Sipos</surname>
              <given-names>Gergely</given-names>
            </name>
            <name name-style="western">
              <surname>Sylla</surname>
              <given-names>Karl-Heinz</given-names>
            </name>
            <name name-style="western">
              <surname>Tähtinen</surname>
              <given-names>Marko</given-names>
            </name>
            <name name-style="western">
              <surname>Vicario</surname>
              <given-names>Saverio</given-names>
            </name>
            <name name-style="western">
              <surname>Vos</surname>
              <given-names>Rutger Aldo</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>Alan R.</given-names>
            </name>
            <name name-style="western">
              <surname>Yilmaz</surname>
              <given-names>Pelin</given-names>
            </name>
          </person-group>
          <year>2016</year>
          <article-title>BioVeL: a virtual laboratory for data analysis and modelling in biodiversity science and ecology</article-title>
          <source>BMC Ecology</source>
          <volume>16</volume>
          <issue>1</issue>
          <pub-id pub-id-type="doi">10.1186/s12898-016-0103-y</pub-id>
        </element-citation>
      </ref>
      <ref id="B5930021">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Haston</surname>
              <given-names>E.</given-names>
            </name>
            <name name-style="western">
              <surname>Albenga</surname>
              <given-names>L.</given-names>
            </name>
            <name name-style="western">
              <surname>Chagnoux</surname>
              <given-names>S.</given-names>
            </name>
            <name name-style="western">
              <surname>Drinkwater</surname>
              <given-names>R.</given-names>
            </name>
            <name name-style="western">
              <surname>Durrant</surname>
              <given-names>J.</given-names>
            </name>
            <name name-style="western">
              <surname>Gilbert</surname>
              <given-names>E.</given-names>
            </name>
            <name name-style="western">
              <surname>Glöckler</surname>
              <given-names>F.</given-names>
            </name>
            <name name-style="western">
              <surname>Green</surname>
              <given-names>L.</given-names>
            </name>
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>D.</given-names>
            </name>
            <name name-style="western">
              <surname>Holetschek</surname>
              <given-names>J.</given-names>
            </name>
            <name name-style="western">
              <surname>Hudson</surname>
              <given-names>L.</given-names>
            </name>
            <name name-style="western">
              <surname>Kahle</surname>
              <given-names>P.</given-names>
            </name>
            <name name-style="western">
              <surname>King</surname>
              <given-names>S.</given-names>
            </name>
            <name name-style="western">
              <surname>Kirchhoff</surname>
              <given-names>A.</given-names>
            </name>
            <name name-style="western">
              <surname>Kroupa</surname>
              <given-names>A.</given-names>
            </name>
            <name name-style="western">
              <surname>Kvacek</surname>
              <given-names>J.</given-names>
            </name>
            <name name-style="western">
              <surname>Le Bras</surname>
              <given-names>G.</given-names>
            </name>
            <name name-style="western">
              <surname>Livermore</surname>
              <given-names>L.</given-names>
            </name>
            <name name-style="western">
              <surname>Mühlenberger</surname>
              <given-names>G.</given-names>
            </name>
            <name name-style="western">
              <surname>Paul</surname>
              <given-names>D.</given-names>
            </name>
            <name name-style="western">
              <surname>Philips</surname>
              <given-names>S.</given-names>
            </name>
            <name name-style="western">
              <surname>Smirnova</surname>
              <given-names>L.</given-names>
            </name>
            <name name-style="western">
              <surname>Vacek</surname>
              <given-names>F.</given-names>
            </name>
          </person-group>
          <article-title>
            <italic>D4.2 - Automating data capture from natural history specimens | SYNTHESYS3</italic>
          </article-title>
          <uri>http://synthesys3.myspecies.info/node/695</uri>
          <date-in-citation content-type="access-date">2018-10-21T00:00:00+03:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5930048">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hoehndorf</surname>
              <given-names>Robert</given-names>
            </name>
            <name name-style="western">
              <surname>Alshahrani</surname>
              <given-names>Mona</given-names>
            </name>
            <name name-style="western">
              <surname>Gkoutos</surname>
              <given-names>Georgios V.</given-names>
            </name>
            <name name-style="western">
              <surname>Gosline</surname>
              <given-names>George</given-names>
            </name>
            <name name-style="western">
              <surname>Groom</surname>
              <given-names>Quentin</given-names>
            </name>
            <name name-style="western">
              <surname>Hamann</surname>
              <given-names>Thomas</given-names>
            </name>
            <name name-style="western">
              <surname>Kattge</surname>
              <given-names>Jens</given-names>
            </name>
            <name name-style="western">
              <surname>de Oliveira</surname>
              <given-names>Sylvia Mota</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidt</surname>
              <given-names>Marco</given-names>
            </name>
            <name name-style="western">
              <surname>Sierra</surname>
              <given-names>Soraya</given-names>
            </name>
            <name name-style="western">
              <surname>Smets</surname>
              <given-names>Erik</given-names>
            </name>
            <name name-style="western">
              <surname>Vos</surname>
              <given-names>Rutger A.</given-names>
            </name>
            <name name-style="western">
              <surname>Weiland</surname>
              <given-names>Claus</given-names>
            </name>
          </person-group>
          <year>2016</year>
          <article-title>The flora phenotype ontology (FLOPO): tool for integrating morphological traits and phenotypes of vascular plants</article-title>
          <source>Journal of Biomedical Semantics</source>
          <volume>7</volume>
          <issue>1</issue>
          <pub-id pub-id-type="doi">10.1186/s13326-016-0107-8</pub-id>
        </element-citation>
      </ref>
      <ref id="B5930066">
        <element-citation publication-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Indurkhya</surname>
              <given-names>Nitin</given-names>
            </name>
            <name name-style="western">
              <surname>Damerau</surname>
              <given-names>Fred J.</given-names>
            </name>
          </person-group>
          <year>2010</year>
          <source>Handbook of Natural Language Processing</source>
          <edition>2nd</edition>
          <publisher-name>Chapman and Hall/CRC</publisher-name>
          <publisher-loc>New York</publisher-loc>
          <isbn>9780429149207</isbn>
          <pub-id pub-id-type="doi">10.1201/9781420085938</pub-id>
        </element-citation>
      </ref>
      <ref id="B5930074">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>Ridong</given-names>
            </name>
            <name name-style="western">
              <surname>Banchs</surname>
              <given-names>Rafael E.</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Haizhou</given-names>
            </name>
          </person-group>
          <year>2016</year>
          <article-title>Evaluating and Combining Name Entity Recognition Systems</article-title>
          <source>Proceedings of the Sixth Named Entity Workshop</source>
          <pub-id pub-id-type="doi">10.18653/v1/w16-2703</pub-id>
        </element-citation>
      </ref>
      <ref id="B5930083">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Levenshtein</surname>
              <given-names>V. I.</given-names>
            </name>
          </person-group>
          <year>1966</year>
          <article-title>Binary codes capable of correcting deletions, insertions and reversals</article-title>
          <source>
            <italic>Soviet physics doklady</italic>
          </source>
          <volume>10</volume>
          <issue>8</issue>
          <fpage>707</fpage>
          <lpage>710</lpage>
        </element-citation>
      </ref>
      <ref id="B5930092">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liew</surname>
              <given-names>Chee Sun</given-names>
            </name>
            <name name-style="western">
              <surname>Atkinson</surname>
              <given-names>Malcolm P.</given-names>
            </name>
            <name name-style="western">
              <surname>Galea</surname>
              <given-names>Michelle</given-names>
            </name>
            <name name-style="western">
              <surname>Ang</surname>
              <given-names>Tan Fong</given-names>
            </name>
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>Paul</given-names>
            </name>
            <name name-style="western">
              <surname>Hemert</surname>
              <given-names>Jano I. Van</given-names>
            </name>
          </person-group>
          <year>2016</year>
          <article-title>Scientific Workflows</article-title>
          <source>ACM Computing Surveys</source>
          <volume>49</volume>
          <issue>4</issue>
          <fpage>1</fpage>
          <lpage>39</lpage>
          <pub-id pub-id-type="doi">10.1145/3012429</pub-id>
        </element-citation>
      </ref>
      <ref id="B5930103">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ludäscher</surname>
              <given-names>Bertram</given-names>
            </name>
            <name name-style="western">
              <surname>Altintas</surname>
              <given-names>Ilkay</given-names>
            </name>
            <name name-style="western">
              <surname>Berkley</surname>
              <given-names>Chad</given-names>
            </name>
            <name name-style="western">
              <surname>Higgins</surname>
              <given-names>Dan</given-names>
            </name>
            <name name-style="western">
              <surname>Jaeger</surname>
              <given-names>Efrat</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>Matthew</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>Edward A.</given-names>
            </name>
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>Jing</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>Yang</given-names>
            </name>
          </person-group>
          <year>2006</year>
          <article-title>Scientific workflow management and the Kepler system</article-title>
          <source>Concurrency and Computation: Practice and Experience</source>
          <volume>18</volume>
          <issue>10</issue>
          <fpage>1039</fpage>
          <lpage>1065</lpage>
          <pub-id pub-id-type="doi">10.1002/cpe.994</pub-id>
        </element-citation>
      </ref>
      <ref id="B5930117">
        <element-citation publication-type="conference-paper">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lui</surname>
              <given-names>M.</given-names>
            </name>
            <name name-style="western">
              <surname>Baldwin</surname>
              <given-names>T.</given-names>
            </name>
          </person-group>
          <year>2012</year>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Min</given-names>
            </name>
          </person-group>
          <article-title>langid.py: An off-the-shelf language identification tool</article-title>
          <source>Proceedings of the ACL 2012 System Demonstrations</source>
          <conf-loc>Jeju Island, Korea</conf-loc>
          <conf-date>July 2012</conf-date>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <uri>https://www.aclweb.org/anthology/P12-3005</uri>
        </element-citation>
      </ref>
      <ref id="B5930131">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Corporation</surname>
              <given-names>Microsoft</given-names>
            </name>
          </person-group>
          <article-title>
            <italic>Microsoft OneNote</italic>
          </article-title>
          <uri>http://www.onenote.com/?404&amp;public=1</uri>
          <date-in-citation content-type="access-date">2018-11-22T00:00:00+02:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5930139">
        <element-citation publication-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mori</surname>
              <given-names>S.</given-names>
            </name>
            <name name-style="western">
              <surname>Nishida</surname>
              <given-names>H.</given-names>
            </name>
            <name name-style="western">
              <surname>Yamada</surname>
              <given-names>H.</given-names>
            </name>
          </person-group>
          <year>1999</year>
          <source>
            <italic>Optical character recognition</italic>
          </source>
          <edition>1</edition>
          <publisher-name>Wiley-Interscience</publisher-name>
          <isbn>978-0471308195</isbn>
        </element-citation>
      </ref>
      <ref id="B5930147">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nadeau</surname>
              <given-names>David</given-names>
            </name>
            <name name-style="western">
              <surname>Sekine</surname>
              <given-names>Satoshi</given-names>
            </name>
          </person-group>
          <year>2009</year>
          <article-title>A survey of named entity recognition and classification</article-title>
          <source>Benjamins Current Topics</source>
          <fpage>3</fpage>
          <lpage>28</lpage>
          <pub-id pub-id-type="doi">10.1075/bct.19.03nad</pub-id>
        </element-citation>
      </ref>
      <ref id="B5930156">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Museum</surname>
              <given-names>Natural History</given-names>
            </name>
          </person-group>
          <article-title>Ouratea dariensis Whitef</article-title>
          <uri>https://data.nhm.ac.uk/object/be595f07-73c5-4764-a96c-8b377e3d1507/1586822400000</uri>
          <date-in-citation content-type="access-date">2020-04-15T00:00:00+03:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5930164">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Museum</surname>
              <given-names>Natural History</given-names>
            </name>
          </person-group>
          <article-title>Zwackhiomyces kantvilasii Kondr</article-title>
          <uri>https://data.nhm.ac.uk/object/dfdbbcd3-bcb3-460c-bbb0-6330b2505439/1586822400000</uri>
          <date-in-citation content-type="access-date">2020-04-14T00:00:00+03:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5930172">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Museum</surname>
              <given-names>Natural History</given-names>
            </name>
          </person-group>
          <article-title>Dinosauria Owen, 1841</article-title>
          <uri>https://data.nhm.ac.uk/object/eb6b1ad8-6c16-437c-859e-cd505c4e321f/1586822400000</uri>
          <date-in-citation content-type="access-date">2020-04-15T00:00:00+03:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5930180">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Museum</surname>
              <given-names>Natural History</given-names>
            </name>
          </person-group>
          <article-title>Poecilia picta Regan, 1913</article-title>
          <uri>https://data.nhm.ac.uk/dataset/collection-specimens/resource/05ff2255-c38a-40c9-b657-4ccb55ab2feb/record/625771</uri>
          <date-in-citation content-type="access-date">2020-05-08T00:00:00+03:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5930188">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Museum</surname>
              <given-names>Natural History</given-names>
            </name>
          </person-group>
          <article-title>Capraiella Conci, 1941</article-title>
          <uri>https://data.nhm.ac.uk/object/c65d9a3c-d8f6-4fac-a418-05c3b697cece/1586822400000</uri>
          <date-in-citation content-type="access-date">2020-04-15T00:00:00+03:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5930196">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Museum</surname>
              <given-names>Natural History</given-names>
            </name>
          </person-group>
          <article-title>Bombus (Orientalibombus) haemorrhoidalis Smith, F.</article-title>
          <uri>https://data.nhm.ac.uk/object/745febc7-8222-498a-9969-5f6b12f85ef3/1586822400000</uri>
          <date-in-citation content-type="access-date">2020-04-15T00:00:00+03:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5998994">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nieva de la Hidalga</surname>
              <given-names>Abraham</given-names>
            </name>
            <name name-style="western">
              <surname>Owen</surname>
              <given-names>David</given-names>
            </name>
            <name name-style="western">
              <surname>Spasić</surname>
              <given-names>Irena</given-names>
            </name>
            <name name-style="western">
              <surname>Rosin</surname>
              <given-names>Paul</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Xianfang</given-names>
            </name>
          </person-group>
          <year>2019</year>
          <article-title>Use of Semantic Segmentation for Increasing the Throughput of Digitisation Workflows for Natural History Collections</article-title>
          <source>Biodiversity Information Science and Standards</source>
          <volume>3</volume>
          <pub-id pub-id-type="doi">10.3897/biss.3.37161</pub-id>
        </element-citation>
      </ref>
      <ref id="B5930220">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ooms</surname>
              <given-names>Jeroen</given-names>
            </name>
          </person-group>
          <article-title>Tesseract 4 is here! State of the art OCR in R!</article-title>
          <uri>https://ropensci.org/technotes/2018/11/06/tesseract-40/</uri>
          <date-in-citation content-type="access-date">2018-12-20T00:00:00+02:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5930228">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Owen</surname>
              <given-names>D.</given-names>
            </name>
            <name name-style="western">
              <surname>Groom</surname>
              <given-names>Q.</given-names>
            </name>
            <name name-style="western">
              <surname>Hardisty</surname>
              <given-names>A.</given-names>
            </name>
            <name name-style="western">
              <surname>Leegwater</surname>
              <given-names>T.</given-names>
            </name>
            <name name-style="western">
              <surname>van Walsum</surname>
              <given-names>M.</given-names>
            </name>
            <name name-style="western">
              <surname>Wijkamp</surname>
              <given-names>N.</given-names>
            </name>
            <name name-style="western">
              <surname>Spasić</surname>
              <given-names>I.</given-names>
            </name>
          </person-group>
          <year>2019</year>
          <article-title>Methods for Automated Text Digitisation</article-title>
          <source>Zenodo</source>
          <pub-id pub-id-type="doi">10.5281/zenodo.3364502</pub-id>
        </element-citation>
      </ref>
      <ref id="B5930240">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pyke</surname>
              <given-names>Graham H.</given-names>
            </name>
            <name name-style="western">
              <surname>Ehrlich</surname>
              <given-names>Paul R.</given-names>
            </name>
          </person-group>
          <year>2010</year>
          <article-title>Biological collections and ecological/environmental research: a review, some observations and a look to the future</article-title>
          <source>Biological Reviews</source>
          <volume>85</volume>
          <issue>2</issue>
          <fpage>247</fpage>
          <lpage>266</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1469-185x.2009.00098.x</pub-id>
        </element-citation>
      </ref>
      <ref id="B5930249">
        <element-citation publication-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Riley</surname>
              <given-names>Jenn</given-names>
            </name>
          </person-group>
          <year>2017</year>
          <source>Understanding Metadata: What is Metadata, and What is it For?: A Primer</source>
          <publisher-name>National Information Standards Organization</publisher-name>
          <uri>https://www.niso.org/publications/understanding-metadata-2017</uri>
          <isbn>978-1-937522-72-8</isbn>
        </element-citation>
      </ref>
      <ref id="B5930257">
        <element-citation publication-type="thesis">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Scheidl</surname>
              <given-names>Harald</given-names>
            </name>
          </person-group>
          <year>2018</year>
          <source>Handwritten text recognition in historical documents</source>
          <publisher-name>Vienna University of Technology</publisher-name>
          <publisher-loc>Vienna</publisher-loc>
          <uri>https://repositum.tuwien.ac.at/obvutwhs/content/titleinfo/2874742</uri>
        </element-citation>
      </ref>
      <ref id="B5930265">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shuyo</surname>
              <given-names>N.</given-names>
            </name>
          </person-group>
          <article-title>
            <italic>language-detection</italic>
          </article-title>
          <uri>https://github.com/shuyo/language-detection</uri>
          <date-in-citation content-type="access-date">2018-10-31T00:00:00+02:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5930273">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Spasić</surname>
              <given-names>Irena</given-names>
            </name>
            <name name-style="western">
              <surname>Greenwood</surname>
              <given-names>Mark</given-names>
            </name>
            <name name-style="western">
              <surname>Preece</surname>
              <given-names>Alun</given-names>
            </name>
            <name name-style="western">
              <surname>Francis</surname>
              <given-names>Nick</given-names>
            </name>
            <name name-style="western">
              <surname>Elwyn</surname>
              <given-names>Glyn</given-names>
            </name>
          </person-group>
          <year>2013</year>
          <article-title>FlexiTerm: a flexible term recognition method</article-title>
          <source>Journal of Biomedical Semantics</source>
          <volume>4</volume>
          <issue>1</issue>
          <pub-id pub-id-type="doi">10.1186/2041-1480-4-27</pub-id>
        </element-citation>
      </ref>
      <ref id="B5930283">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Spasić</surname>
              <given-names>Irena</given-names>
            </name>
          </person-group>
          <year>2018</year>
          <article-title>Acronyms as an Integral Part of Multi-Word Term Recognition – A Token of Appreciation</article-title>
          <source>IEEE Access</source>
          <volume>6</volume>
          <fpage>8351</fpage>
          <lpage>8363</lpage>
          <pub-id pub-id-type="doi">10.1109/access.2018.2807122</pub-id>
        </element-citation>
      </ref>
      <ref id="B5930292">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Suarez</surname>
              <given-names>Andrew V.</given-names>
            </name>
            <name name-style="western">
              <surname>Tsutsui</surname>
              <given-names>Neil D.</given-names>
            </name>
          </person-group>
          <year>2004</year>
          <article-title>The Value of Museum Collections for Research and Society</article-title>
          <source>BioScience</source>
          <volume>54</volume>
          <issue>1</issue>
          <fpage>66</fpage>
          <lpage>74</lpage>
          <pub-id pub-id-type="doi">10.1641/0006-3568(2004)054[0066:tvomcf]2.0.co;2</pub-id>
        </element-citation>
      </ref>
      <ref id="B5930301">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Foundation</surname>
              <given-names>The Apache Software</given-names>
            </name>
          </person-group>
          <article-title>
            <italic>Apache Taverna</italic>
          </article-title>
          <uri>https://taverna.incubator.apache.org/</uri>
          <date-in-citation content-type="access-date">2018-10-21T00:00:00+03:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5930309">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thessen</surname>
              <given-names>Anne E.</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>Hong</given-names>
            </name>
            <name name-style="western">
              <surname>Mozzherin</surname>
              <given-names>Dmitry</given-names>
            </name>
          </person-group>
          <year>2012</year>
          <article-title>Applications of Natural Language Processing in Biodiversity Science</article-title>
          <source>Advances in Bioinformatics</source>
          <volume>2012</volume>
          <fpage>1</fpage>
          <lpage>17</lpage>
          <pub-id pub-id-type="doi">10.1155/2012/391574</pub-id>
        </element-citation>
      </ref>
      <ref id="B5930318">
        <element-citation publication-type="website">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Group</surname>
              <given-names>The Stanford Natural Language Processing</given-names>
            </name>
          </person-group>
          <article-title>
            <italic>Stanford Named Entity Recogniser (NER)</italic>
          </article-title>
          <uri>https://nlp.stanford.edu/software/CRF-NER.shtml</uri>
          <date-in-citation content-type="access-date">2018-10-20T00:00:00+03:00</date-in-citation>
        </element-citation>
      </ref>
      <ref id="B5930335">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wheeler</surname>
              <given-names>Q. D.</given-names>
            </name>
            <name name-style="western">
              <surname>Knapp</surname>
              <given-names>S.</given-names>
            </name>
            <name name-style="western">
              <surname>Stevenson</surname>
              <given-names>D. W.</given-names>
            </name>
            <name name-style="western">
              <surname>Stevenson</surname>
              <given-names>J.</given-names>
            </name>
            <name name-style="western">
              <surname>Blum</surname>
              <given-names>S. D.</given-names>
            </name>
            <name name-style="western">
              <surname>Boom</surname>
              <given-names>B. M.</given-names>
            </name>
            <name name-style="western">
              <surname>Borisy</surname>
              <given-names>G. G.</given-names>
            </name>
            <name name-style="western">
              <surname>Buizer</surname>
              <given-names>J. L.</given-names>
            </name>
            <name name-style="western">
              <surname>De Carvalho</surname>
              <given-names>M. R.</given-names>
            </name>
            <name name-style="western">
              <surname>Cibrian</surname>
              <given-names>A.</given-names>
            </name>
            <name name-style="western">
              <surname>Donoghue</surname>
              <given-names>M. J.</given-names>
            </name>
            <name name-style="western">
              <surname>Doyle</surname>
              <given-names>V.</given-names>
            </name>
            <name name-style="western">
              <surname>Gerson</surname>
              <given-names>E. M.</given-names>
            </name>
            <name name-style="western">
              <surname>Graham</surname>
              <given-names>C. H.</given-names>
            </name>
            <name name-style="western">
              <surname>Graves</surname>
              <given-names>P.</given-names>
            </name>
            <name name-style="western">
              <surname>Graves</surname>
              <given-names>S. J.</given-names>
            </name>
            <name name-style="western">
              <surname>Guralnick</surname>
              <given-names>R. P.</given-names>
            </name>
            <name name-style="western">
              <surname>Hamilton</surname>
              <given-names>A. L.</given-names>
            </name>
            <name name-style="western">
              <surname>Hanken</surname>
              <given-names>J.</given-names>
            </name>
            <name name-style="western">
              <surname>Law</surname>
              <given-names>W.</given-names>
            </name>
            <name name-style="western">
              <surname>Lipscomb</surname>
              <given-names>D. L.</given-names>
            </name>
            <name name-style="western">
              <surname>Lovejoy</surname>
              <given-names>T. E.</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>H.</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>J. S.</given-names>
            </name>
            <name name-style="western">
              <surname>Naeem</surname>
              <given-names>S.</given-names>
            </name>
            <name name-style="western">
              <surname>Novacek</surname>
              <given-names>M. J.</given-names>
            </name>
            <name name-style="western">
              <surname>Page</surname>
              <given-names>L. M.</given-names>
            </name>
            <name name-style="western">
              <surname>Platnick</surname>
              <given-names>N. I.</given-names>
            </name>
            <name name-style="western">
              <surname>Porter-Morgan</surname>
              <given-names>H.</given-names>
            </name>
            <name name-style="western">
              <surname>Raven</surname>
              <given-names>P. H.</given-names>
            </name>
            <name name-style="western">
              <surname>Solis</surname>
              <given-names>M. A.</given-names>
            </name>
            <name name-style="western">
              <surname>Valdecasas</surname>
              <given-names>A. G.</given-names>
            </name>
            <name name-style="western">
              <surname>Van Der Leeuw</surname>
              <given-names>S.</given-names>
            </name>
            <name name-style="western">
              <surname>Vasco</surname>
              <given-names>A.</given-names>
            </name>
            <name name-style="western">
              <surname>Vermeulen</surname>
              <given-names>N.</given-names>
            </name>
            <name name-style="western">
              <surname>Vogel</surname>
              <given-names>J.</given-names>
            </name>
            <name name-style="western">
              <surname>Walls</surname>
              <given-names>R. L.</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>E. O.</given-names>
            </name>
            <name name-style="western">
              <surname>Woolley</surname>
              <given-names>J. B.</given-names>
            </name>
          </person-group>
          <year>2012</year>
          <article-title>Mapping the biosphere: exploring species to understand the origin, organization and sustainability of biodiversity</article-title>
          <source>Systematics and Biodiversity</source>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>20</lpage>
          <pub-id pub-id-type="doi">10.1080/14772000.2012.665095</pub-id>
        </element-citation>
      </ref>
      <ref id="B5930379">
        <element-citation publication-type="article">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wieczorek</surname>
              <given-names>John</given-names>
            </name>
            <name name-style="western">
              <surname>Bloom</surname>
              <given-names>David</given-names>
            </name>
            <name name-style="western">
              <surname>Guralnick</surname>
              <given-names>Robert</given-names>
            </name>
            <name name-style="western">
              <surname>Blum</surname>
              <given-names>Stan</given-names>
            </name>
            <name name-style="western">
              <surname>Döring</surname>
              <given-names>Markus</given-names>
            </name>
            <name name-style="western">
              <surname>Giovanni</surname>
              <given-names>Renato</given-names>
            </name>
            <name name-style="western">
              <surname>Robertson</surname>
              <given-names>Tim</given-names>
            </name>
            <name name-style="western">
              <surname>Vieglais</surname>
              <given-names>David</given-names>
            </name>
          </person-group>
          <year>2012</year>
          <article-title>Darwin Core: An Evolving Community-Developed Biodiversity Data Standard</article-title>
          <source>PLoS ONE</source>
          <volume>7</volume>
          <issue>1</issue>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0029715</pub-id>
        </element-citation>
      </ref>
    </ref-list>
  </back>
  <floats-group>
    <fig-group id="F5930393" position="float" orientation="portrait">
      <caption>
        <p>A range of specimens that demonstrate the wide taxonomic range of specimens encountered in collections. They also demonstrate the diversity of label types, which include handwritten, typed, and printed labels. Note the presence of various barcodes, rulers, and a colour chart in addition to labels describing the origin of the specimen and its identity.</p>
      </caption>
      <fig id="F5930398" position="float" orientation="portrait">
        <object-id content-type="arpha">4FDE9BC5-D6EE-5182-8EBB-6A3CE1CF719E</object-id>
        <object-id content-type="doi">10.3897/rio.6.e58030.figure1a</object-id>
        <label>Figure 1a.</label>
        <caption>
          <p>Herbarium specimen (<xref ref-type="bibr" rid="B5930156">Natural History Museum 2007a</xref>)</p>
        </caption>
        <graphic xlink:href="rio-06-e58030-g001_a.jpg" xlink:type="simple" position="float" orientation="portrait" id="oo_403481.jpg">
          <uri content-type="original_file">https://binary.pensoft.net/fig/403481</uri>
        </graphic>
      </fig>
      <fig id="F5930399" position="float" orientation="portrait">
        <object-id content-type="arpha">1291341B-DCCC-54DC-91F3-C2E4A1F02CB4</object-id>
        <object-id content-type="doi">10.3897/rio.6.e58030.figure1b</object-id>
        <label>Figure 1b.</label>
        <caption>
          <p>Pinned insect specimen (<xref ref-type="bibr" rid="B5930196">Natural History Museum 2018</xref>)</p>
        </caption>
        <graphic xlink:href="rio-06-e58030-g001_b.jpg" xlink:type="simple" position="float" orientation="portrait" id="oo_403482.jpg">
          <uri content-type="original_file">https://binary.pensoft.net/fig/403482</uri>
        </graphic>
      </fig>
      <fig id="F5930400" position="float" orientation="portrait">
        <object-id content-type="arpha">E43E02A8-6ED7-58CC-AA2E-3FA8BE9EE92C</object-id>
        <object-id content-type="doi">10.3897/rio.6.e58030.figure1c</object-id>
        <label>Figure 1c.</label>
        <caption>
          <p>Microscope slide (<xref ref-type="bibr" rid="B5930188">Natural History Museum 2017</xref>)</p>
        </caption>
        <graphic xlink:href="rio-06-e58030-g001_c.jpg" xlink:type="simple" position="float" orientation="portrait" id="oo_403487.jpg">
          <uri content-type="original_file">https://binary.pensoft.net/fig/403487</uri>
        </graphic>
      </fig>
      <fig id="F5930401" position="float" orientation="portrait">
        <object-id content-type="arpha">70BAE738-E9CC-5E4F-863D-114C76C8097E</object-id>
        <object-id content-type="doi">10.3897/rio.6.e58030.figure1d</object-id>
        <label>Figure 1d.</label>
        <caption>
          <p>Fossilised animal skin (<xref ref-type="bibr" rid="B5930172">Natural History Museum 2009</xref>)</p>
        </caption>
        <graphic xlink:href="rio-06-e58030-g001_d.jpg" xlink:type="simple" position="float" orientation="portrait" id="oo_403488.jpg">
          <uri content-type="original_file">https://binary.pensoft.net/fig/403488</uri>
        </graphic>
      </fig>
      <fig id="F5930402" position="float" orientation="portrait">
        <object-id content-type="arpha">2D800F07-28F0-535D-B89E-4BF0951794BF</object-id>
        <object-id content-type="doi">10.3897/rio.6.e58030.figure1e</object-id>
        <label>Figure 1e.</label>
        <caption>
          <p>Liquid preserved specimen (<xref ref-type="bibr" rid="B5930180">Natural History Museum 2010</xref>)</p>
        </caption>
        <graphic xlink:href="rio-06-e58030-g001_e.jpg" xlink:type="simple" position="float" orientation="portrait" id="oo_408321.jpg">
          <uri content-type="original_file">https://binary.pensoft.net/fig/408321</uri>
        </graphic>
      </fig>
    </fig-group>
    <fig id="F5930404" position="float" orientation="portrait">
      <object-id content-type="arpha">59837838-199C-50B6-8B26-E694AA20E723</object-id>
      <object-id content-type="doi">10.3897/rio.6.e58030.figure2</object-id>
      <label>Figure 2.</label>
      <caption>
        <p>A possible semi-automatic digitisation workflow to extract data from the labels of collection specimens.</p>
      </caption>
      <graphic xlink:href="rio-06-e58030-g002.png" position="float" id="oo_434806.png" orientation="portrait" xlink:type="simple">
        <uri content-type="original_file">https://binary.pensoft.net/fig/434806</uri>
      </graphic>
    </fig>
    <fig id="F5930406" position="float" orientation="portrait">
      <object-id content-type="arpha">EB72A160-E478-5B06-98EE-0A6144EBA045</object-id>
      <object-id content-type="doi">10.3897/rio.6.e58030.figure3</object-id>
      <label>Figure 3.</label>
      <caption>
        <p>The criteria used by each contributing institution to select a test set of 200 herbarium specimens. We did not attempt global coverage but instead aimed at a representative sample from BR=Brazil, CN=China, ID=Indonesia, AU=Australasia, US=United States of America, and TZ=Tanzania.</p>
      </caption>
      <graphic xlink:href="rio-06-e58030-g003.png" position="float" id="oo_396805.png" orientation="portrait" xlink:type="simple">
        <uri content-type="original_file">https://binary.pensoft.net/fig/396805</uri>
      </graphic>
    </fig>
    <fig id="F5930408" position="float" orientation="portrait">
      <object-id content-type="arpha">12E249C7-A46A-5E56-86DC-A65EC67C9B8D</object-id>
      <object-id content-type="doi">10.3897/rio.6.e58030.figure4</object-id>
      <label>Figure 4.</label>
      <caption>
        <p>An example of specimen labels. 1=Title, 2=Barcode, 3=Species name, 4=Determined by and date, 5=Locality, 6=Habitat and altitude, 7=Notes, 8=Collector name, specimen number, and collection date.</p>
      </caption>
      <graphic xlink:href="rio-06-e58030-g004.png" position="float" id="oo_396806.png" orientation="portrait" xlink:type="simple">
        <uri content-type="original_file">https://binary.pensoft.net/fig/396806</uri>
      </graphic>
    </fig>
    <fig id="F5930410" position="float" orientation="portrait">
      <object-id content-type="arpha">0363A109-4313-5111-AA60-5E66A5473EAA</object-id>
      <object-id content-type="doi">10.3897/rio.6.e58030.figure5</object-id>
      <label>Figure 5.</label>
      <caption>
        <p>An impression of the different challenges presented by specimen image segments. 1=Label with both printed and handwritten text, 2=Printed label oriented vertically, 3=Barcode composed of irrelevant characters, 4=Colour chart containing no text, 5=Ruler containing no useful text.</p>
      </caption>
      <graphic xlink:href="rio-06-e58030-g005.jpg" position="float" id="oo_396807.jpg" orientation="portrait" xlink:type="simple">
        <uri content-type="original_file">https://binary.pensoft.net/fig/396807</uri>
      </graphic>
    </fig>
    <fig id="F5930412" position="float" orientation="portrait">
      <object-id content-type="arpha">1914091F-C4C7-5575-A5CD-C942DD0CC306</object-id>
      <object-id content-type="doi">10.3897/rio.6.e58030.figure6</object-id>
      <label>Figure 6.</label>
      <caption>
        <p>An example of an instantiated Darwin Core record.</p>
      </caption>
      <graphic xlink:href="rio-06-e58030-g006.png" position="float" id="oo_396809.png" orientation="portrait" xlink:type="simple">
        <uri content-type="original_file">https://binary.pensoft.net/fig/396809</uri>
      </graphic>
    </fig>
    <fig id="F5930414" position="float" orientation="portrait">
      <object-id content-type="arpha">2C68A44A-0679-59EF-B72E-9A2090B65E41</object-id>
      <object-id content-type="doi">10.3897/rio.6.e58030.figure7</object-id>
      <label>Figure 7.</label>
      <caption>
        <p>Measuring OCR accuracy.</p>
        <p>Specimen source: NHM Data Portal (<xref ref-type="bibr" rid="B5930164">Natural History Museum 2007b</xref>).</p>
      </caption>
      <graphic xlink:href="rio-06-e58030-g007.png" position="float" id="oo_396811.png" orientation="portrait" xlink:type="simple">
        <uri content-type="original_file">https://binary.pensoft.net/fig/396811</uri>
      </graphic>
    </fig>
    <fig id="F5930416" position="float" orientation="portrait">
      <object-id content-type="arpha">0752A078-F839-5F3C-A801-6815102C95AF</object-id>
      <object-id content-type="doi">10.3897/rio.6.e58030.figure8</object-id>
      <label>Figure 8.</label>
      <caption>
        <p>Comparison of Levenshtein distance scores for ABBYY FineReader Engine and Google Cloud Vision for selected fields, Lev<sub>year</sub>&gt;0 excluded.</p>
      </caption>
      <graphic xlink:href="rio-06-e58030-g008.png" position="float" id="oo_396813.png" orientation="portrait" xlink:type="simple">
        <uri content-type="original_file">https://binary.pensoft.net/fig/396813</uri>
      </graphic>
    </fig>
    <fig id="F5930418" position="float" orientation="portrait">
      <object-id content-type="arpha">3D4D35F7-C0A7-5973-92E4-E994F3BD6DD8</object-id>
      <object-id content-type="doi">10.3897/rio.6.e58030.figure9</object-id>
      <label>Figure 9.</label>
      <caption>
        <p>A summary of the Levenshtein distance scores for different label elements from handwritten text recognition using ABBYY FineReader Engine. HTR results are compared to label data interpreted by humans.</p>
      </caption>
      <graphic xlink:href="rio-06-e58030-g009.png" position="float" id="oo_396815.png" orientation="portrait" xlink:type="simple">
        <uri content-type="original_file">https://binary.pensoft.net/fig/396815</uri>
      </graphic>
    </fig>
    <fig id="F5930420" position="float" orientation="portrait">
      <object-id content-type="arpha">97DC558D-55FC-58F7-9F96-25A91930B9E5</object-id>
      <object-id content-type="doi">10.3897/rio.6.e58030.figure10</object-id>
      <label>Figure 10.</label>
      <caption>
        <p>A summary of the Levenshtein distance scores for different label elements from handwritten text recognition using Google Cloud Vision. HTR results are compared to label data interpreted by humans.</p>
      </caption>
      <graphic xlink:href="rio-06-e58030-g010.png" position="float" id="oo_396816.png" orientation="portrait" xlink:type="simple">
        <uri content-type="original_file">https://binary.pensoft.net/fig/396816</uri>
      </graphic>
    </fig>
    <fig id="F5930422" position="float" orientation="portrait">
      <object-id content-type="arpha">0B054B06-18AE-5329-9AF3-F443503DFBE0</object-id>
      <object-id content-type="doi">10.3897/rio.6.e58030.figure11</object-id>
      <label>Figure 11.</label>
      <caption>
        <p>The distribution of languages across the specimens and herbaria. EN=English, FR=French, LA=Latin, ET=Estonian, DE=German, NL=Dutch, PT=Portuguese, ES=Spanish, SV=Swedish, RU=Russian, FI=Finnish, IT=Italian, ZZ=Unknown. The codes for the contributing herbaria are listed in Table <xref ref-type="table" rid="T5930428">1</xref> (from <xref ref-type="bibr" rid="B5929838">Dillen et al. 2019</xref>).</p>
      </caption>
      <graphic xlink:href="rio-06-e58030-g011.png" position="float" id="oo_396817.png" orientation="portrait" xlink:type="simple">
        <uri content-type="original_file">https://binary.pensoft.net/fig/396817</uri>
      </graphic>
    </fig>
    <fig id="F5930424" position="float" orientation="portrait">
      <object-id content-type="arpha">BC467F13-6CD1-5747-9EA2-A25C7D3A3CE9</object-id>
      <object-id content-type="doi">10.3897/rio.6.e58030.figure12</object-id>
      <label>Figure 12.</label>
      <caption>
        <p>An example of a specimen label used in named entity recognition. The output of the process is presented in Fig. <xref ref-type="fig" rid="F5930426">13</xref>.</p>
      </caption>
      <graphic xlink:href="rio-06-e58030-g012.jpg" position="float" id="oo_396818.jpg" orientation="portrait" xlink:type="simple">
        <uri content-type="original_file">https://binary.pensoft.net/fig/396818</uri>
      </graphic>
    </fig>
    <fig id="F5930426" position="float" orientation="portrait">
      <object-id content-type="arpha">2F37B46A-3A58-57CC-B95D-2629A461D044</object-id>
      <object-id content-type="doi">10.3897/rio.6.e58030.figure13</object-id>
      <label>Figure 13.</label>
      <caption>
        <p>Gold standard versus NER output of the label in Fig. <xref ref-type="fig" rid="F5930424">12</xref>.</p>
      </caption>
      <graphic xlink:href="rio-06-e58030-g013.png" position="float" id="oo_396819.png" orientation="portrait" xlink:type="simple">
        <uri content-type="original_file">https://binary.pensoft.net/fig/396819</uri>
      </graphic>
    </fig>
    <table-wrap id="T5930428" position="float" orientation="portrait">
      <label>Table 1.</label>
      <caption>
        <p>Contributing institutions and their codes from <ext-link ext-link-type="uri" xlink:href="http://sweetgum.nybg.org/science/ih/">Index Herbariorum</ext-link>.</p>
      </caption>
      <table rules="all" border="1">
        <thead>
          <tr>
            <th rowspan="1" colspan="1">Institution</th>
            <th rowspan="1" colspan="1">Index Herbariorum Code</th>
            <th rowspan="1" colspan="1">ICEDIG Partner</th>
          </tr>
        </thead>
        <tbody>
          <tr>
            <td rowspan="1" colspan="1">Naturalis Biodiversity Center, Leiden, Netherlands</td>
            <td rowspan="1" colspan="1">L</td>
            <td rowspan="1" colspan="1">Yes</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">Meise Botanic Garden, Meise, Belgium</td>
            <td rowspan="1" colspan="1">BR</td>
            <td rowspan="1" colspan="1">Yes</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">University of Tartu, Tartu, Estonia</td>
            <td rowspan="1" colspan="1">TU</td>
            <td rowspan="1" colspan="1">Yes</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">The Natural History Museum, London, United Kingdom</td>
            <td rowspan="1" colspan="1">BM</td>
            <td rowspan="1" colspan="1">Yes</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">Muséum national d'Histoire naturelle (MNHN), Paris, France</td>
            <td rowspan="1" colspan="1">P</td>
            <td rowspan="1" colspan="1">Yes</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">Royal Botanic Gardens, Kew (RBGK), Richmond, United Kingdom</td>
            <td rowspan="1" colspan="1">K</td>
            <td rowspan="1" colspan="1">Yes</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">Finnish Museum of Natural History, Helsinki, Finland</td>
            <td rowspan="1" colspan="1">H</td>
            <td rowspan="1" colspan="1">Yes</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">Botanic Garden and Botanical Museum, Berlin, Germany</td>
            <td rowspan="1" colspan="1">B</td>
            <td rowspan="1" colspan="1">No</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">Royal Botanic Garden Edinburgh, United Kingdom</td>
            <td rowspan="1" colspan="1">E</td>
            <td rowspan="1" colspan="1">No</td>
          </tr>
        </tbody>
      </table>
    </table-wrap>
    <table-wrap id="T5930429" position="float" orientation="portrait">
      <label>Table 2.</label>
      <caption>
        <p>A summary of specimen properties. The names and Index Herbariorum codes for the contributing herbaria are listed in Table <xref ref-type="table" rid="T5930428">1</xref>.</p>
      </caption>
      <table rules="all" border="1">
        <tbody>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Contributor</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Words Per Specimen</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Handwritten Content</bold>
            </td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">BR</td>
            <td rowspan="1" colspan="1">47</td>
            <td rowspan="1" colspan="1">49.0%</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">H</td>
            <td rowspan="1" colspan="1">77</td>
            <td rowspan="1" colspan="1">21.3%</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">P</td>
            <td rowspan="1" colspan="1">45</td>
            <td rowspan="1" colspan="1">42.3%</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">L</td>
            <td rowspan="1" colspan="1">64</td>
            <td rowspan="1" colspan="1">22.0%</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">BM</td>
            <td rowspan="1" colspan="1">59</td>
            <td rowspan="1" colspan="1">32.8%</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">B</td>
            <td rowspan="1" colspan="1">61</td>
            <td rowspan="1" colspan="1">50.1%</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">E</td>
            <td rowspan="1" colspan="1">54</td>
            <td rowspan="1" colspan="1">68.0%</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">K</td>
            <td rowspan="1" colspan="1">79</td>
            <td rowspan="1" colspan="1">17.8%</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">TU</td>
            <td rowspan="1" colspan="1">26</td>
            <td rowspan="1" colspan="1">62.2%</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Average</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>57</bold>
            </td>
            <td rowspan="1" colspan="1"><bold>40.6</bold>%</td>
          </tr>
        </tbody>
      </table>
    </table-wrap>
    <table-wrap id="T5930430" position="float" orientation="portrait">
      <label>Table 3.</label>
      <caption>
        <p>Comparison of selected OCR software tools.</p>
      </caption>
      <table rules="all" border="1">
        <tbody>
          <tr>
            <td rowspan="1" colspan="1"/>
            <td rowspan="1" colspan="1">
              <bold>Founded Year</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Latest Stable Version</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>License</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Windows</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Macintosh</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Linux</bold>
            </td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Tesseract</bold>
            </td>
            <td rowspan="1" colspan="1">1985</td>
            <td rowspan="1" colspan="1">4.0.0</td>
            <td rowspan="1" colspan="1">Apache</td>
            <td rowspan="1" colspan="1">Windows 10</td>
            <td rowspan="1" colspan="1">Mac OS X<break/>10.14.x</td>
            <td rowspan="1" colspan="1">Ubuntu 18.04, 18.10</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>ABBYY FineReader Engine</bold>
            </td>
            <td rowspan="1" colspan="1">1989</td>
            <td rowspan="1" colspan="1">12.0</td>
            <td rowspan="1" colspan="1">Proprietary</td>
            <td rowspan="1" colspan="1">Windows 10, 8.1, 8, 7-SP1</td>
            <td rowspan="1" colspan="1">Mac OS X 10.12.x, 10.13.x</td>
            <td rowspan="1" colspan="1">Ubuntu 17.10, 16.04.1, 14.04.5</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Microsoft OneNote</bold>
            </td>
            <td rowspan="1" colspan="1">2012</td>
            <td rowspan="1" colspan="1">17.10325.20049</td>
            <td rowspan="1" colspan="1">Proprietary</td>
            <td rowspan="1" colspan="1">Windows 10, 8.1</td>
            <td rowspan="1" colspan="1">Mac OS X, 10.12 or later</td>
            <td rowspan="1" colspan="1">Ubuntu 18.04, 18.10</td>
          </tr>
        </tbody>
      </table>
    </table-wrap>
    <table-wrap id="T5930431" position="float" orientation="portrait">
      <label>Table 4.</label>
      <caption>
        <p>Processing times for OCR programs using whole images and segments.</p>
      </caption>
      <table rules="all" border="1">
        <tbody>
          <tr>
            <td rowspan="2" colspan="1"/>
            <td rowspan="1" colspan="3">
              <bold>Processing Time (h:m:s)</bold>
            </td>
            <td rowspan="1" colspan="1"/>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>250 Whole Images</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>1,837 Segments</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Difference</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Difference</bold>
              <break/>
              <bold>(Percentage Saving)</bold>
            </td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Tesseract 4.0.0</bold>
            </td>
            <td rowspan="1" colspan="1">01:06:05</td>
            <td rowspan="1" colspan="1">00:45:02</td>
            <td rowspan="1" colspan="1">-00:21:03</td>
            <td rowspan="1" colspan="1">-31.9%</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Tesseract 3.0.51</bold>
            </td>
            <td rowspan="1" colspan="1">00:50:02</td>
            <td rowspan="1" colspan="1">00:23:17</td>
            <td rowspan="1" colspan="1">-00:26:45</td>
            <td rowspan="1" colspan="1">-53.5%</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>ABBYY FineReader Engine 12.0</bold>
            </td>
            <td rowspan="1" colspan="1">01:18:15</td>
            <td rowspan="1" colspan="1">00:29:24</td>
            <td rowspan="1" colspan="1">-00:48:51</td>
            <td rowspan="1" colspan="1">-62.4%</td>
          </tr>
        </tbody>
      </table>
    </table-wrap>
    <table-wrap id="T5930432" position="float" orientation="portrait">
      <label>Table 5.</label>
      <caption>
        <p>Line correctness for OCR using whole images and their segments.</p>
      </caption>
      <table rules="all" border="1">
        <tbody>
          <tr>
            <td rowspan="1" colspan="1"/>
            <td rowspan="1" colspan="1">
              <bold>5 Whole Images</bold>
              <break/>
              <bold>Mean Line Correctness (%)</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>22 Segments</bold>
              <break/>
              <bold>Mean Line Correctness (%)</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Difference</bold>
            </td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Tesseract 4.0.0</bold>
            </td>
            <td rowspan="1" colspan="1">72.8</td>
            <td rowspan="1" colspan="1">75.2</td>
            <td rowspan="1" colspan="1">+2.4</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Tesseract 3.0.51</bold>
            </td>
            <td rowspan="1" colspan="1">44.1</td>
            <td rowspan="1" colspan="1">63.7</td>
            <td rowspan="1" colspan="1">+19.6</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>ABBYY FineReader Engine 12.0</bold>
            </td>
            <td rowspan="1" colspan="1">61.0</td>
            <td rowspan="1" colspan="1">77.3</td>
            <td rowspan="1" colspan="1">+16.3</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Microsoft OneNote 2013</bold>
            </td>
            <td rowspan="1" colspan="1">78.9</td>
            <td rowspan="1" colspan="1">65.5</td>
            <td rowspan="1" colspan="1">-13.4</td>
          </tr>
        </tbody>
      </table>
    </table-wrap>
    <table-wrap id="T5930433" position="float" orientation="portrait">
      <label>Table 6.</label>
      <caption>
        <p>Language identification software tools and their properties.</p>
      </caption>
      <table rules="all" border="1">
        <tbody>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Software</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Licence</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Organisation</bold>
            </td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">langid.py</td>
            <td rowspan="1" colspan="1">Open Source</td>
            <td rowspan="1" colspan="1">University of Melbourne</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">langdetect</td>
            <td rowspan="1" colspan="1">Apache License Version 2.0</td>
            <td rowspan="1" colspan="1">N/A</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">language-detection</td>
            <td rowspan="1" colspan="1">Apache License Version 2.0</td>
            <td rowspan="1" colspan="1">Cybozu Labs, Inc.</td>
          </tr>
        </tbody>
      </table>
    </table-wrap>
    <table-wrap id="T5930434" position="float" orientation="portrait">
      <label>Table 7.</label>
      <caption>
        <p>Example of langid.py usage with fragments of OCR text. Output lines denote the language identified in the input text and the probability estimate for the language.</p>
      </caption>
      <table rules="all" border="1">
        <tbody>
          <tr>
            <td rowspan="1" colspan="1"><bold>Input</bold>: “Unangwa Hill about 6 km. E. of Songea in crevices in vertical rock faces”<break/><bold>Output</bold>: English [99%]</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1"><bold>Input</bold>: “Herbier de Jardin botanique de l'Etat”<break/><bold>Output</bold>: French [99%]</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1"><bold>Input</bold>: “Tartu olikooli juures oleva loodusuurijate seltsi botaanika sekstsiooni”<break/><bold>Output</bold>: Estonian [99%]</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1"><bold>Input</bold>: “Arbusto de ca. 2 m, média ramificação.”<break/><bold>Output</bold>: Portuguese [100%]</td>
          </tr>
        </tbody>
      </table>
    </table-wrap>
    <table-wrap id="T5930435" position="float" orientation="portrait">
      <label>Table 8.</label>
      <caption>
        <p>Confusion matrix for predicted and actual labels.</p>
      </caption>
      <table rules="all" border="1">
        <tbody>
          <tr>
            <td rowspan="2" colspan="2"/>
            <td rowspan="1" colspan="2">
              <bold>Predicted (NER)</bold>
            </td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Negative</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Positive</bold>
            </td>
          </tr>
          <tr>
            <td rowspan="2" colspan="1">
              <bold>Actual</bold>
              <break/>
              <bold>(Gold Standard)</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Negative</bold>
            </td>
            <td rowspan="1" colspan="1">True Negative</td>
            <td rowspan="1" colspan="1">False Positive</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Positive</bold>
            </td>
            <td rowspan="1" colspan="1">False Negative</td>
            <td rowspan="1" colspan="1">True Positive</td>
          </tr>
        </tbody>
      </table>
    </table-wrap>
    <table-wrap id="T5930436" position="float" orientation="portrait">
      <label>Table 9.</label>
      <caption>
        <p>NER performance on OCR text retrieved from whole images.</p>
      </caption>
      <table rules="all" border="1">
        <tbody>
          <tr>
            <td rowspan="1" colspan="1"/>
            <td rowspan="1" colspan="1">
              <bold>PERSON</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>LOCATION</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Overall</bold>
            </td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Precision</bold>
            </td>
            <td rowspan="1" colspan="1">0.81</td>
            <td rowspan="1" colspan="1">0.38</td>
            <td rowspan="1" colspan="1">0.69</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Recall</bold>
            </td>
            <td rowspan="1" colspan="1">0.71</td>
            <td rowspan="1" colspan="1">0.21</td>
            <td rowspan="1" colspan="1">0.53</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>F1</bold>
            </td>
            <td rowspan="1" colspan="1">0.76</td>
            <td rowspan="1" colspan="1">0.27</td>
            <td rowspan="1" colspan="1">0.60</td>
          </tr>
        </tbody>
      </table>
    </table-wrap>
    <table-wrap id="T5930437" position="float" orientation="portrait">
      <label>Table 10.</label>
      <caption>
        <p>NER performance on OCR text retrieved from image segments.</p>
      </caption>
      <table rules="all" border="1">
        <tbody>
          <tr>
            <td rowspan="1" colspan="1"/>
            <td rowspan="1" colspan="1">
              <bold>PERSON</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>LOCATION</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Overall</bold>
            </td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Precision</bold>
            </td>
            <td rowspan="1" colspan="1">0.85</td>
            <td rowspan="1" colspan="1">0.43</td>
            <td rowspan="1" colspan="1">0.74</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Recall</bold>
            </td>
            <td rowspan="1" colspan="1">0.74</td>
            <td rowspan="1" colspan="1">0.50</td>
            <td rowspan="1" colspan="1">0.69</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>F1</bold>
            </td>
            <td rowspan="1" colspan="1">0.79</td>
            <td rowspan="1" colspan="1">0.46</td>
            <td rowspan="1" colspan="1">0.71</td>
          </tr>
        </tbody>
      </table>
    </table-wrap>
    <table-wrap id="T5930438" position="float" orientation="portrait">
      <label>Table 11.</label>
      <caption>
        <p>A summary of recommendations.</p>
      </caption>
      <table rules="all" border="1">
        <tbody>
          <tr>
            <td rowspan="1" colspan="1">
              <bold>Task</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Software</bold>
            </td>
            <td rowspan="1" colspan="1">
              <bold>Comment</bold>
            </td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">Optical Character Recognition</td>
            <td rowspan="1" colspan="1">Tesseract 4.0.0</td>
            <td rowspan="1" colspan="1">Robust with respect to image segmentation</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">Handwritten Text Recognition</td>
            <td rowspan="1" colspan="1">Google Cloud Vision</td>
            <td rowspan="1" colspan="1">Supports 56 languages</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">Language identification</td>
            <td rowspan="1" colspan="1">langid.py</td>
            <td rowspan="1" colspan="1">Supports 97 languages</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">Named Entity Recognition</td>
            <td rowspan="1" colspan="1">Stanford NER</td>
            <td rowspan="1" colspan="1">A wide variety of entities recognised including location, organisation, date, time, and person</td>
          </tr>
          <tr>
            <td rowspan="1" colspan="1">Terminology extraction</td>
            <td rowspan="1" colspan="1">FlexiTerm</td>
            <td rowspan="1" colspan="1">Robust with respect to orthographic variations (such as those introduced by OCR)</td>
          </tr>
        </tbody>
      </table>
    </table-wrap>
    <supplementary-material id="S5930392" orientation="portrait" position="float" xlink:type="simple">
      <object-id content-type="arpha">FD722EF8-8396-5DA3-A476-8A2D68583597</object-id>
      <object-id content-type="doi">10.3897/rio.6.e58030.suppl1</object-id>
      <label>Supplementary material 1</label>
      <caption>
        <p>Appendices</p>
      </caption>
      <statement content-type="dataType">
        <label>Data type</label>
        <p>text, images</p>
      </statement>
      <statement content-type="notes">
        <label>Brief description</label>
        <p>For the sake of brevity, the Appendices can be found in this supplementary document. The document contains the following principal information concerning the Digitisation Experiments:</p>
        <p><list list-type="bullet"><list-item><p>OCR Software Settings</p></list-item><list-item><p>OCR Line Correctness Analysis Data</p></list-item><list-item><p>NER Analysis Data</p></list-item><list-item><p>Non-standard Terminology Extraction Analysis Data</p></list-item></list> </p>
      </statement>
      <p>File: oo_425114.pdf</p>
      <media xlink:href="rio-06-e58030-s001.pdf" mimetype="Adobe Acrobat Document" mime-subtype="pdf" position="float" orientation="portrait" xlink:type="simple">
        <uri content-type="original_file">https://binary.pensoft.net/file/425114</uri>
      </media>
      <attrib specific-use="authors">David Owen</attrib>
    </supplementary-material>
  </floats-group>
</article>
