<?xml version="1.0" encoding="UTF-8"?>
<teiHeader xml:id="NKJP_header" n="1.0" xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en" type="corpus">
 <fileDesc>    
  <titleStmt>
   <title xml:lang="pl">Narodowy Korpus Języka Polskiego</title>
   <title xml:lang="en">National Corpus of Polish</title>
   <funder xml:lang="pl" xml:id="mnisw">Ministerstwo Nauki i Szkolnictwa Wyższego (Polska)</funder>
   <funder xml:lang="en">Ministry of Science and Higher Education (Poland)</funder>

   <respStmt>
    <orgName xml:id="ipipan">
     <orgName xml:lang="en">Institute of Computer Science at the Polish Academy of Sciences</orgName>
     <orgName xml:lang="pl">Instytut Podstaw Informatyki Polskiej Akademii Nauk</orgName>
    </orgName>
    <orgName xml:id="ijppan">
     <orgName xml:lang="en">Institute of Polish Language at the Polish Academy of Sciences</orgName>
     <orgName xml:lang="pl">Instytut Języka Polskiego Polskiej Akademii Nauk</orgName>
    </orgName>
    <orgName xml:id="pelcra">
     <orgName xml:lang="en">University of Łódź</orgName>
     <orgName xml:lang="pl">Uniwersytet Łódzki</orgName>
    </orgName>
    <orgName xml:id="pwn">
     <orgName xml:lang="en">Polish Scientific Publishers PWN</orgName>
     <orgName xml:lang="pl">Wydawnictwo Naukowe PWN</orgName>
    </orgName>
    <resp>NKJP Consortium responsible for the development of the corpus</resp>
   </respStmt>

   <respStmt>
    <persName xml:id="adamp">Adam Przepiórkowski, 
     <email>adamp@ipipan.waw.pl</email>
    </persName>
    <resp>head of the team at <ref target="#ipipan">the Institute of Computer Science at the Polish Academy of Sciences</ref></resp> 
    <resp>project coordinator</resp> 
   </respStmt>
   <respStmt>
    <persName xml:id="rlg">Rafał L. Górski, 
     <email>RafalG@ijp-pan.krakow.pl</email>
    </persName>
    <resp>head of the team at <ref target="#ijppan">the Institute of Polish Language at the Polish Academy of Sciences</ref></resp> 
   </respStmt>
   <respStmt>
    <persName xml:id="blt">Barbara Lewandowska-Tomaszczyk, 
     <email>blt@uni.lodz.pl</email>
    </persName>
    <resp>head of the team at <ref target="#pelcra">the University of Łódź</ref></resp> 
   </respStmt>
   <respStmt>
    <persName xml:id="marekl">Marek Łaziński, 
     <email>M.Lazinski@uw.edu.pl</email></persName>
    <resp>head of the team at <ref target="#pwn">the Polish Scientific Publishers PWN</ref></resp> 
   </respStmt>
   <respStmt>
    <persName xml:id="mbanko">Mirosław Bańko</persName>
    <resp>representative of <ref target="#pwn">the Polish Scientific Publishers PWN</ref></resp> 
   </respStmt>
   <respStmt>
    <persName xml:id="iwill">Izabela Will</persName>
    <resp>project administration in 2008</resp>
    <resp>manual annotation of a 1-million word subcorpus</resp>
   </respStmt>
   <respStmt>
    <persName xml:id="beataw">Beata Wójtowicz</persName>
    <resp>project administration in 2009-2011</resp>
   </respStmt>
   <respStmt>
    <persName xml:id="bansp">Piotr Bański</persName>
    <resp>initial design of various XML schemata</resp>
   </respStmt>
   <respStmt>
    <persName xml:id="ldegorski">Łukasz Degórski</persName>
    <resp>general technical responsibilities in the project (at <ref target="#ipipan">Warsaw</ref>)</resp>
   </respStmt>
   <respStmt>
    <persName xml:id="pezik">Piotr Pęzik</persName>
    <resp>general technical responsibilities in the project (at <ref target="#pelcra">Łódź</ref>)</resp>
   </respStmt>
   <respStmt>
    <persName xml:id="ldrozdz">Łukasz Dróżdż</persName>
    <resp>general technical responsibilities in the project (at <ref target="#pelcra">Łódź</ref>)</resp>
   </respStmt>
   <respStmt>
    <persName xml:id="jwilk">Jakub Wilk</persName>
    <resp>maintenance of the <ref target="http://nkjp.pl/">http://nkjp.pl/</ref> pages</resp>
    <resp>maintenance and development of the Poliqarp search engine</resp>
   </respStmt>
   <respStmt>
    <persName xml:id="anna">Anna Andrzejczuk</persName>
    <resp>maintenance of a list of corpus texts</resp>
    <resp>manual annotation of a 1-million word subcorpus</resp>
   </respStmt>
   <respStmt>
    <persName xml:id="czelan">Anna Czelakowska</persName>
    <resp>manual annotation of a 1-million word subcorpus</resp>
   </respStmt>
   <respStmt>
    <persName xml:id="lewdorota">Dorota Lewandowska</persName>
    <resp>manual annotation of a 1-million word subcorpus</resp>
   </respStmt>
  </titleStmt>
  
  <editionStmt>
   <edition>unstable, under development</edition>
  </editionStmt>
  
  <publicationStmt>
   <pubPlace>Warsaw, Poland</pubPlace>
   <address>
    <addrLine xml:lang="pl">Instytut Podstaw Informatyki PAN</addrLine>
    <addrLine xml:lang="pl">ul. Ordona 21</addrLine>
    <addrLine xml:lang="pl">01-237 Warszawa</addrLine>
    <addrLine>Poland</addrLine>
    <addrLine>tel. (+48 22) 8362841, fax (+48 22) 8376564</addrLine>
    <addrLine><email>ipi@ipipan.waw.pl</email></addrLine>
    <addrLine><ref target="http://nkjp.pl/" n="www">http://nkjp.pl/</ref></addrLine>
   </address>
   <publisher>Institute of Computer Science, Polish Academy of Sciences</publisher>
   <distributor>NKJP Consortium</distributor>
   <availability>
    <p>For searching over the Internet (unless availability status described as "restricted" in the text header) and for the internal use of NKJP partners only.</p>
    <p>Particular headers may contain more specific information or may override the information given here.</p>
   </availability>
   <date when="2010-12-31">31 December 2010 (expected publication date)</date>
  </publicationStmt>
  
  <sourceDesc>
   <p>The origin of texts in NKJP may be:
    <list type="bulleted">
     <item>the IPI PAN Corpus</item>
     <item>the PELCRA Corpus</item>
     <item>the PWN Corpus</item>
     <item>the IJP PAN Corpus</item>
     <item>texts collected by IJP PAN, PELCRA and PWN specifically for NKJP.</item>
    </list>
   </p>
   <p>See sourceDesc/bibl/note[@text_origin] in particular header.xml files.</p>
  </sourceDesc>
 </fileDesc>

 <profileDesc>
  <langUsage>
   <language ident="pl">Polish</language>
   <language ident="en">English</language>
  </langUsage>
 </profileDesc>
 
 <encodingDesc>

  <projectDesc>
   <p>A linguistic corpus is a collection of texts where one can find the typical use of a single word or a phrase, as well as their meaning and grammatical function. Nowadays, without access to a language corpus, it has become impossible to do linguistic research, to write dictionaries, grammars and language teaching books, to create search engines sensitive to Polish inflexion, machine translation engines and software of advanced language technology. Language corpora have become an essential tool for linguists, but they are also helpful for software engineers, scholars of literature and culture, historians, librarians and other specialists of art and computer sciences.</p>
   <p>There already exist national corpora compiled by the <ref target="http://www.natcorp.ox.ac.uk">British</ref>, <ref target="http://www.ids-mannheim.de/kl/projekte/korpora/">Germans</ref>, <ref target="http://ucnk.ff.cuni.cz/english/index.html">Czechs</ref> and <ref target="http://www.ruscorpora.ru/en/index.html">Russians</ref>. Polish people also need an extensive, well-balanced language corpus – a language source which can be accessed online.</p>
   <p>The National Corpus of Polish is a shared initiative of four institutions: <ref target="http://www.ipipan.waw.pl/">Institute of Computer Science</ref> at the Polish Academy of Sciences (coordinator), <ref target="http://www.ijp-pan.krakow.pl">Institute of Polish Language</ref> at the Polish Academy of Sciences, <ref target="http://www.pwn.pl">Polish Scientific Publishers PWN</ref>, and the Department of Computational and Corpus Linguistics at the <ref target="http://www.uni.lodz.pl/">University of Łódź</ref>. It has been registered as a research-development project of <ref target="http://www.nauka.gov.pl">the Ministry of Science and Higher Education</ref>.</p>
   <p>These four institutions have started cooperation to build a reference corpus of the Polish language containing hundreds of millions of words. The corpus that will appear soon on this site will be searchable by means of advanced tools that analyse Polish inflection and the Polish sentence structure.</p>
   <p>The list of sources for the corpora contains classic literature, daily newspapers, specialist periodicals and journals, transcripts of conversations, and a variety of short-lived and internet texts. For a corpus to be reliable, it must not only contain a large number of words, but also a diversity of texts with respect to subject and genre. The conversations ought to represent both male and female speakers, in various age groups, coming from various regions in Poland.</p>
  </projectDesc> 

  <samplingDecl>
   <p>Whole texts are included, whenever possible.  They are split in text_structure.xml into frontmatter, body and backmatter, and only the part in the body is annotated linguistically.</p>
  </samplingDecl>

  <editorialDecl>
   <p>For privacy reasons, some of the people's names mentioned in the transcribed conversations have been modified. Uppercase characters are only used in popular proper nouns.</p>
   <p>Titles of spoken-conversational transcriptions were arbitrarily assigned by annotators. They are metadata elements separate from the contents of transcribed conversations.</p>
   <p>In case of obvious systematic code page conversion errors in some texts, corrections were made semi-automatically.</p>
  </editorialDecl>

  <tagsDecl>
   <namespace name="http://www.tei-c.org/ns/1.0">
    <tagUsage gi="residence">Used to denote the speaker's longest place of residence.  (Used only for spoken data.)</tagUsage> 
   </namespace>
   <namespace name="http://www.nkjp.pl/ns/1.0">
    <tagUsage gi="topic">The topic of a conversation (i.e., used only for spoken data).</tagUsage>
    <tagUsage gi="fsLib">Contains feature and feature-value libraries.</tagUsage>
   </namespace>
  </tagsDecl>

  <refsDecl>
   <p>The <att>xml:id</att> attribute of the <gi>teiHeader</gi> element for each text contains the unique identifier of that text in the whole NKJP corpus.  Its value should consist of 1) "IJPPAN", "IPIPAN", "PWN" or "PELCRA", 2) the underscore "_" and 3) a sequence of digits unique for texts from the partner indicated in 1).</p>
  </refsDecl>

  <!-- Each text should belong to one of the classes in the following
  taxonomy, not necessarily to one of the leaves (maximally specific
  categories).  For example, an Internet text which is neither a (part
  of a) forum nor a WWW page will be classified as #typ_internet. -->
  <classDecl>
   <taxonomy xml:id="taxonomy-NKJP-type">  
    <category xml:id="typ_lit"> <!-- target: 16% of the corpus -->
     <desc xml:lang="pl">literatura piękna</desc>
     <desc xml:lang="en">fiction</desc>
     <category xml:id="typ_lit_proza">
      <desc xml:lang="pl">proza</desc>
      <desc xml:lang="en">prose</desc>
     </category>
     <category xml:id="typ_lit_poezja">
      <desc xml:lang="pl">poezja</desc>
      <desc xml:lang="en">poetry</desc>
     </category>
     <category xml:id="typ_lit_dramat">
      <desc xml:lang="pl">dramat</desc>
      <desc xml:lang="en">drama</desc>
     </category>
    </category>
    <category xml:id="typ_fakt"> <!-- target: 5.5% of the corpus -->
     <desc xml:lang="pl">literatura faktu</desc>
     <desc xml:lang="en">non-fiction literature</desc>
    </category>
    <category xml:id="typ_publ"> <!-- target: 50% of the corpus; see the comment below -->
     <desc xml:lang="pl">publicystyka i wiadomości prasowe</desc>
     <desc xml:lang="en">journalism</desc>
    </category>
    <category xml:id="typ_nd"> <!-- target: 2% of the corpus -->
     <desc xml:lang="pl">naukowo-dydaktyczny</desc>
     <desc xml:lang="en">academic writing and textbooks</desc>
    </category>
    <category xml:id="typ_inf-por"> <!-- target: 5.5% of the corpus -->
     <desc xml:lang="pl">informacyjno-poradnikowy</desc>
     <desc xml:lang="en">instructive writing and guidebooks</desc>
    </category>
    <!-- the use of the category #typ_nklas should be avoided -->
    <category xml:id="typ_nklas"> <!-- target: no more than 1% of the corpus -->
     <desc xml:lang="pl">książka niebeletrystyczna niesklasyfikowana</desc>
     <desc xml:lang="en">unclassified non-fiction book</desc>
    </category>
    <category xml:id="typ_inne_pisane"> <!-- target: 3% of the corpus-->
     <desc xml:lang="pl">inne teksty pisane</desc>
     <desc xml:lang="en">miscellaneous (written)</desc>
     <category xml:id="typ_urzed">
      <desc xml:lang="pl">urzędowo-kancelaryjny</desc>
      <desc xml:lang="en">legal and official</desc> 
     </category>
     <category xml:id="typ_persw">
      <desc xml:lang="pl">teksty perswazyjne (ogłoszenia, reklamy, propaganda polityczna)</desc>
      <desc xml:lang="en">advertisements, announcements, political marketing</desc> 
     </category>
     <category xml:id="typ_instr">
      <desc xml:lang="pl">krótkie teksty instruktażowe</desc>
      <desc xml:lang="en">user manuals</desc> 
     </category>
     <category xml:id="typ_listy">
      <desc xml:lang="pl">listy</desc>
      <desc xml:lang="en">letters</desc> 
     </category>
    </category>
    <category xml:id="typ_internet"> <!-- target: 7% of the corpus -->
     <desc xml:lang="pl">Internet</desc>
     <desc xml:lang="en">Internet</desc>
     <category xml:id="typ_net_interakt">
      <desc xml:lang="pl">interaktywne (fora, czaty, krótkie wiadomości tekstowe, listy dyskusyjne itp.)</desc>
      <desc xml:lang="en">interactive (forums, chat rooms, instant messaging, mailing lists)</desc> 
     </category>
     <category xml:id="typ_net_nieinterakt">
      <desc xml:lang="pl">statyczne strony WWW</desc>
      <desc xml:lang="en">static WWW pages</desc> 
     </category>
    </category>		
    <!-- cumulative target for the following three spoken categories: 10% of the corpus -->
    <category xml:id="typ_konwers">
     <desc xml:lang="pl">konwersacyjne</desc>
     <desc xml:lang="en">conversational</desc>
    </category>
    <category xml:id="typ_media">
     <desc xml:lang="pl">mówione medialne</desc>
     <desc xml:lang="en">spoken from the media</desc>
    </category>
    <category xml:id="typ_qmow">
     <desc xml:lang="pl">quasi-mówione</desc>
     <desc xml:lang="en">quasi-spoken</desc>
    </category>
   </taxonomy>
  </classDecl>

  <!-- The genre "publicystyka i wiadomości prasowe" (Eng.: "journalism")
  consists of 51% texts taken from dailies, 47% texts taken from
  magazines and 2% texts taken from journalistic books.

  A journalistic text in a daily is defined as a text which is
  labelled in the header by the combination of 1) <catRef
  scheme="#taxonomy-NKJP-type" target="#typ_publ"> and 2) <catRef
  scheme="#taxonomy-NKJP-channel" target="#kanal_prasa_dziennik">.

  A journalistic text in a magazine is defined in the header by the
  combination of 1) <catRef scheme="#taxonomy-NKJP-type"
  target="#typ_publ"> and 2) <catRef scheme="#taxonomy-NKJP-channel"
  target="#kanal_prasa"> or one of its subcategories, with the
  exception of <catRef scheme="#taxonomy-NKJP-channel"
  target="#kanal_prasa_dziennik">.

  A journalistic book is defined by the combination of 1) <catRef
  scheme="#taxonomy-NKJP-type" target="#typ_publ"> and 2) <catRef
  scheme="#taxonomy-NKJP-channel" target="#kanal_ksiazka">.

  In total, texts taken from dailies, magazines and books make up
  25.5%, 23.5% and 1% of the entire corpus, respectively. This
  selection of channels of texts is done in order to assure
  representativeness of the corpus. -->


  <!-- Each text should belong to one of the *leaves* (maximally
  specific categories) in the following taxonomy. -->
  <classDecl>
   <taxonomy xml:id="taxonomy-NKJP-channel">  
    <category xml:id="kanal_prasa">
     <desc xml:lang="pl">prasa</desc>
     <desc xml:lang="en">press</desc>
     <category xml:id="kanal_prasa_dziennik">
      <desc xml:lang="pl">dziennik</desc>
      <desc xml:lang="en">daily</desc>
     </category>
     <category xml:id="kanal_prasa_tygodnik">
      <desc xml:lang="pl">tygodnik</desc>
      <desc xml:lang="en">weekly</desc>
     </category>
     <category xml:id="kanal_prasa_miesiecznik">
      <desc xml:lang="pl">miesięcznik</desc>
      <desc xml:lang="en">monthly</desc>
     </category>
     <category xml:id="kanal_prasa_inne"> <!-- e.g., bi-weekly or occasional -->
      <desc xml:lang="pl">inne prasowe</desc>
      <desc xml:lang="en">other press</desc>
     </category>
    </category>
    <category xml:id="kanal_ksiazka">
     <desc xml:lang="pl">książka</desc>
     <desc xml:lang="en">book</desc>
    </category>
    <category xml:id="kanal_internet">
     <desc xml:lang="pl">Internet</desc>
     <desc xml:lang="en">internet</desc>
    </category>
    <category xml:id="kanal_mowiony">
     <desc xml:lang="pl">mówiony</desc>
     <desc xml:lang="en">spoken</desc>
    </category>
    <category xml:id="kanal_ulotka">
     <desc xml:lang="pl">ulotki, ogłoszenia, reklamy</desc>
     <desc xml:lang="en">leaflets, announcements, ads</desc>
    </category>
    <category xml:id="kanal_rkps">
     <desc xml:lang="pl">rękopis lub maszynopis</desc>
     <desc xml:lang="en">manuscript</desc>
    </category>
   </taxonomy>
  </classDecl>

  <classDecl>
   <taxonomy xml:id="ukd">
    <bibl>
     <title xml:lang="pl">Uniwersalna Klasyfikacja Dziesiętna</title>
     <title xml:lang="en">Universal Decimal Classification</title>
     <edition>UDC-P058</edition>
    </bibl>
   </taxonomy>
  </classDecl>

  <classDecl>
   <taxonomy xml:id="bn">
    <bibl>
     <title xml:lang="pl">Klasyfikacja Biblioteki Narodowej</title>
     <title xml:lang="en">Polish National Library Classification</title>
     <edition xml:lang="pl">Słownik języka haseł przedmiotowych Biblioteki Narodowej. Wyd. 5 popr. i rozsz., stan na dzień 31 grudnia 2004 roku.</edition>
    </bibl>
   </taxonomy>
  </classDecl>

  <nkjp:fsLib>
   <fLib n="tools">
    <f xml:id="pantera" name="tool">
     <string>PANTERA Tagger (April 2011)</string>
    </f>
   </fLib>
  </nkjp:fsLib>

 </encodingDesc>

 <revisionDesc> <!-- Only changes to this header recorded below. -->
  <change who="#adamp" when="2009-06-25">Header accepted by NKJP partners on 25 June 2009, after a discussion at the NKJP meeting in Cracow on 19 June 2009 and ensuing modifications.  Any further changes to this header 1) must be recorded below, 2) must be monotonic (cannot necessitate changes in the encoding of existing files).  Any further changes to schemata validating this header must also be monotonic (previous versions of the header must still validate).</change>
  <change who="#adamp" when="2009-07-31">Added the <gi>fsLib</gi> to <gi>encodingDesc</gi> with features defined for two versions of the Anotatornia tool.</change>
  <change who="#adamp" when="2009-08-01">Added <gi>profileDesc</gi>.</change>
  <change who="#adamp" when="2009-08-23">Changed <gi>catDesc</gi> to <gi>desc</gi>, as only one <gi>catDesc</gi> is allowed within <gi>category</gi>, but multiple <gi>desc</gi> elements may occur there.  Without this change this apparently wouldn't be a TEI document (not even TEI Extension).  This looks like a bug in TEI Guidelines, so it has been <ref target="https://sourceforge.net/tracker/index.php?func=detail&amp;aid=2843046&amp;group_id=106328&amp;atid=644062">reported</ref> in the TEI Bug Tracker.</change>
  <change who="#adamp" when="2009-08-23">Various <gi>respStmt</gi> additions and modifications.</change>
  <change who="#adamp" when="2009-08-23">Reference to ann_structure.xml in <gi>samplingDecl</gi> replaced by reference to text_structure.xml.</change>
 </revisionDesc>
 
</teiHeader>

