<!--
Source: https://github.com/opener-project/kaf (raw file kaf-21.dtd)
Tip revision: 171be1314bd1415dae9f79390d0e08f23a7c46d4 authored by aitor-garcia-p on 09 July 2013, 12:32:05 UTC
Commit message: Update message about the wiki
-->
<!-- DTD KAF 2.1 -->
<!-- Root element. Its content is a starred choice, so the layer elements
     listed here may occur in any order, any number of times, or not at all. -->
<!ELEMENT KAF (kafHeader | text | terms | deps | chunks | entities
             | coreferences | features | relations | opinions)*>

<!-- KAF ELEMENT: attributes (all optional) -->
<!ATTLIST KAF
    doc      CDATA #IMPLIED
    xml:lang CDATA #IMPLIED
    version  CDATA #IMPLIED>

<!-- KAFHEADER ELEMENT -->
<!-- Children appear in a fixed order: an optional <fileDesc>, an optional
     <public>, then zero or more <linguisticProcessors>. -->
<!ELEMENT kafHeader (fileDesc?, public?, linguisticProcessors*)>

<!-- FILEDESC ELEMENT -->
<!--
<fileDesc> is an empty element containing information about the
computer document itself. It has the following attributes:

- title: the title of the document (optional).
- author: the author of the document (optional).
- creationtime: when the document was created. In ISO 8601. (optional)
- filename: the original file name (optional).
- filetype: the original format (PDF, HTML, DOC, etc) (optional).
- pages: number of pages of the original document (optional).
-->

<!-- Empty element; every attribute is optional free text (CDATA). -->
<!ELEMENT fileDesc EMPTY>
<!ATTLIST fileDesc
    title        CDATA #IMPLIED
    author       CDATA #IMPLIED
    creationtime CDATA #IMPLIED
    filename     CDATA #IMPLIED
    filetype     CDATA #IMPLIED
    pages        CDATA #IMPLIED>

<!-- PUBLIC ELEMENT -->
<!-- 
<public> is an empty element which stores public information about
the document, such as its URI. It has the following attributes:

- publicId: a public identifier (for instance, the number inserted by the capture server) (optional).
- uri: a public URI of the document (optional).

-->

<!-- Empty element; both attributes are optional. -->
<!ELEMENT public EMPTY>
<!ATTLIST public
    publicId CDATA #IMPLIED
    uri      CDATA #IMPLIED>

		  
<!-- LINGUISTICPROCESSORS ELEMENT -->
<!--  
<linguisticProcessors> elements store the information about which linguistic processors
produced the KAF document. There can be several <linguisticProcessors> elements, one
per KAF layer. KAF layers correspond to the top-level elements of the
documents, such as "text", "terms", "deps" etc.

-->

<!-- One or more <lp> children; the layer attribute is mandatory. -->
<!ELEMENT linguisticProcessors (lp+)>
<!ATTLIST linguisticProcessors
    layer CDATA #REQUIRED>


<!-- LP ELEMENT -->
<!-- 
<lp> elements describe one specific linguistic processor. <lp> elements 
have the following attributes:

- name: the name of the processor
- version: processor's version
- timestamp: a timestamp, denoting the date/time at which the processor was
  launched. The timestamp follows the XML Schema xs:dateTime type (See
  http://www.w3.org/TR/xmlschema-2/#isoformats). In summary, the date is
  specified following the form "YYYY-MM-DDThh:mm:ss" (all fields
  required). To specify a time zone, you can either enter a dateTime in UTC
  time by adding a "Z" behind the time ("2002-05-30T09:00:00Z") or you can
  specify an offset from the UTC time by adding a positive or negative time
  behind the time ("2002-05-30T09:00:00+06:00").
-->

<!-- Empty element; all three attributes are mandatory. They are declared as
     CDATA, so the DTD itself does not check the timestamp format. -->
<!ELEMENT lp EMPTY>
<!ATTLIST lp
    name      CDATA #REQUIRED
    version   CDATA #REQUIRED
    timestamp CDATA #REQUIRED>


<!-- TEXT ELEMENT -->
<!-- Word-form layer: one or more <wf> children. -->
<!ELEMENT text (wf+)>

<!-- WORDFORM ELEMENT -->
<!--
<wf> elements describe and contain all word forms generated after the tokenization step.
<wf> elements have the following attributes:
- wid: the id of the word form (REQUIRED and UNIQUE)
- sent: sentence id of the word form (optional)
- para: paragraph id of the word form (optional)
- page: page id of the word form (optional)
- offset: the offset (in characters) of the word form (optional)
- length: the length (in characters) of the word form (optional)
- xpath: in case of source xml files, the xpath expression identifying the original word form (optional)

-->
<!-- Text-only content. wid is the only mandatory attribute; because it has
     type ID its value must be unique among all ID values in the document. -->
<!ELEMENT wf (#PCDATA)>
<!ATTLIST wf
    wid    ID    #REQUIRED
    sent   CDATA #IMPLIED
    para   CDATA #IMPLIED
    page   CDATA #IMPLIED
    offset CDATA #IMPLIED
    length CDATA #IMPLIED
    xpath  CDATA #IMPLIED>

<!-- TERMS ELEMENT -->
<!-- Term layer: one or more <term> children. -->
<!ELEMENT terms (term+)>

<!-- TERM ELEMENT -->
<!--
    attributes of term elements

    tid: unique identifier (REQUIRED AND UNIQUE)

    type: type of the term. (REQUIRED) Two values are listed here:
       open: open category term
       close: close category term
       (the morphofeat entry below additionally mentions type="entity")

    lemma: lemma of the term (REQUIRED).

    pos: part of speech. (REQUIRED) The first letter of the pos attribute
          must be one of the following:

	  N	common noun
	  R	proper noun
	  G	adjective
	  V	verb
	  P	preposition
	  A	adverb
	  C	conjunction
	  D	determiner
	  O	other

	  more complex pos attributes may be formed by concatenating values separated
	  by a dot ".". For example, in Basque we have "V.ADI.SIN" for simple verbs
	  or "V.ADI.KON" for complex verbs.

    morphofeat: if the term is a named entity (type="entity"), the type of the entity (optional).

    case: declension case of the term (optional).

    head: if the term is a compound, the id of the head component (optional).
-->
<!-- Any number (including zero) of the four child elements, in any order.
     This accepts exactly the same documents as the earlier spelling
     (sentiment?|span|externalReferences|component)+ : since one alternative of
     that repeated choice was optional, it already allowed an empty <term> and
     repeated <sentiment> children.
     NOTE(review): the "?" in the earlier spelling suggests that at most one
     <sentiment> was intended; neither spelling enforces that. Confirm before
     tightening the model. -->
<!ELEMENT term (sentiment | span | externalReferences | component)*>
<!ATTLIST term
    tid        ID    #REQUIRED
    type       CDATA #REQUIRED
    lemma      CDATA #REQUIRED
    pos        CDATA #REQUIRED
    morphofeat CDATA #IMPLIED
    case       CDATA #IMPLIED
    head       CDATA #IMPLIED>



	  
<!-- SENTIMENT FEATURES ELEMENTS -->
<!--
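<sentiment> is an empty element; it carries the following optional attributes (in documents the attribute names are written in lower case with underscores, e.g. sentiment_modifier):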
-	Resource: identifier and reference to an external sentiment resource 
-	Polarity: Refers to the property of a word to express positive, negative or no sentiment. These values are possible: 
	-	Positive
	-	Negative
	-	Neutral
	-	Or numerical value on a numerical scale
-	Strength: refers to the strength of the polarity
	-	Weak
	-	Average
	-	Strong
	-	Or Numerical value
-	Subjectivity: refers to the property of a word to express an opinion (or not)
	-	Subjective/Objective
	-	Factual/opinionated
-	Sentiment_semantic_type: refers to a sentiment-related semantic type
	-	Aesthetics_evaluation
	-	Moral_judgment
	-	Emotion
	-	etc
-	Sentiment modifier: refers to words which modify  the polarity of another word
	-	Intensifier/weakener polarity shifter
-	Sentiment_marker: refers to words which themselves do not carry polarity, but are kind of vehicles of it
	-	Find, think, in my opinion, according to....
-	Sentiment_product_feature: refers to a domain; mainly used in feature-based sentiment analysis
	-	Values are related to specific domain. For the tourist domain, for example, staff, cleanliness, beds, bathroom, transportation, location, etc.. 
-->
<!-- Empty element; every attribute is optional free text (CDATA), so none of
     the value sets is enforced by the DTD. -->
<!ELEMENT sentiment EMPTY>
<!ATTLIST sentiment
    resource                  CDATA #IMPLIED
    polarity                  CDATA #IMPLIED
    strength                  CDATA #IMPLIED
    subjectivity              CDATA #IMPLIED
    sentiment_semantic_type   CDATA #IMPLIED
    sentiment_product_feature CDATA #IMPLIED
    sentiment_modifier        CDATA #IMPLIED
    sentiment_marker          CDATA #IMPLIED>


<!-- EXTERNALREFERENCES ELEMENT -->	  
<!--
The <externalReferences> element is used to associate terms to
external resources, such as elements of a Knowledge base, an ontology,
etc. It consists of several <externalRef> elements, one per
association.
-->

<!-- Container: one or more <externalRef> children. -->
<!ELEMENT externalReferences (externalRef+)>

<!-- EXTERNALREF ELEMENT -->
<!-- 
<externalRef> elements have the following attributes:
- resource: indicates the identifier of the resource referred to. 
- reference: code of the referred element. If the element is a
  synset of some version of WordNet, it follows the pattern:

    [a-z]{3}-[0-9]{2}-[0-9]+-[nvars]

  which is a string composed by four fields separated by a dash. 
  The four fields are the following:


  - Language code (three characters).
  - WordNet version (two digits).
  - Synset identifier composed by digits.
  - POS character:
    n noun
    v verb
    a adjective
    r adverb
  examples of valid patterns are: ``ENG-20-12345678-n'', 
  ``SPA-16-017403-v'', etc.
- confidence: a floating number between 0 and 1. Indicates the confidence weight of the association
-->
<!-- Empty element. resource and reference are mandatory; confidence is
     optional. All three are CDATA, so the DTD checks neither the reference
     pattern nor the numeric range of confidence. -->
<!ELEMENT externalRef EMPTY>
<!ATTLIST externalRef
    resource   CDATA #REQUIRED
    reference  CDATA #REQUIRED
    confidence CDATA #IMPLIED>

<!-- COMPONENT ELEMENT -->
<!--
Compound and multiword terms can be represented in KAF by including <component> elements within <term> elements.
The <component> elements have the following attributes:
-	id: unique identifier (REQUIRED and UNIQUE)
-	lemma: lemma of the term (REQUIRED)
-	pos: part of speech (REQUIRED)
-	case: declension case (optional) 

-->
<!-- Zero or more <externalReferences> children. id has type ID, so it must be
     unique among all ID values in the document. -->
<!ELEMENT component (externalReferences*)>
<!ATTLIST component
    id    ID    #REQUIRED
    lemma CDATA #REQUIRED
    pos   CDATA #REQUIRED
    case  CDATA #IMPLIED>

<!-- DEPS ELEMENT -->
<!-- Dependency layer: one or more <dep> children. -->
<!ELEMENT deps (dep+)>

<!-- DEP ELEMENT -->
<!--
The <dep> elements have the following attributes:
-	from: term id of the source element (REQUIRED)
-	to: term id of the target element (REQUIRED)
-	rfunc: relational function.(REQUIRED)
-   case: declension case (optional)
-->
<!-- Empty element. from and to have type IDREF: each value must match some ID
     declared in the same document. The DTD cannot restrict them to term ids. -->
<!ELEMENT dep EMPTY>
<!ATTLIST dep
    from  IDREF #REQUIRED
    to    IDREF #REQUIRED
    rfunc CDATA #REQUIRED
    case  CDATA #IMPLIED>

<!-- CHUNKS ELEMENT -->
<!-- Chunk layer: one or more <chunk> children. -->
<!ELEMENT chunks (chunk+)>
<!-- CHUNK ELEMENT -->
<!--
The <chunk> elements have the following attributes:
-	cid: unique identifier (REQUIRED)
-	head: the chunk head’s term id  (REQUIRED)
-	phrase: type of the phrase (REQUIRED)
-	case: declension case (optional)
-->
<!-- One or more <span> children. cid has type ID and therefore shares the
     document-wide ID value space with every other ID attribute (including
     the cid of <category>). head has type IDREF and must match an existing
     ID. -->
<!ELEMENT chunk (span+)>
<!ATTLIST chunk
    cid    ID    #REQUIRED
    head   IDREF #REQUIRED
    phrase CDATA #REQUIRED
    case   CDATA #IMPLIED>

<!-- ENTITIES ELEMENT -->
<!-- Named-entity layer: one or more <entity> children. -->
<!ELEMENT entities (entity+)>

<!-- ENTITY ELEMENT -->
<!--
A named entity element has the following attributes: 
-	eid: the id for the named entity (REQUIRED)
-	type:  type of the named entity. (REQUIRED) Currently, 8 values are possible: 
	-	Person
	-	Organization
	-	Location
	-	Date
	-	Time
	-	Money
	-	Percent
	-	Misc 
-->
<!-- One or more <references> children. type is plain CDATA, so the list of
     entity types is not enforced by the DTD. -->
<!ELEMENT entity (references+)>
<!ATTLIST entity
    eid  ID    #REQUIRED
    type CDATA #REQUIRED>

<!-- COREFERENCES ELEMENT -->
<!-- Coreference layer: one or more <coref> children. -->
<!ELEMENT coreferences (coref+)>

<!-- COREF ELEMENT -->
<!--
<coref> element has the following attribute:
-	coid: unique id, starting with the prefix “co”
-->
<!-- One or more <references> children; coid is a mandatory ID. -->
<!ELEMENT coref (references+)>
<!ATTLIST coref
    coid ID #REQUIRED>
	  
	  

<!-- FEATURES OF SENTIMENT ELEMENT -->
<!--
<features> element may contain a <properties> element and a <categories> element.

<properties> element contains one or more <property> elements. A <property> element has the following attributes:
-	pid: the unique identifier of the property
-	lemma: lemma of the property
<categories> element contains one or more <category> elements. A <category> element has the following attributes:
-	cid: the unique identifier of the category
-	lemma: lemma of the category
<property> and <category> elements have the following sub-elements: 
-	references: this element contains one or more reference elements
-	externalReferences (optional): this element contains one or more externalRef elements
-->
<!-- <features> holds zero or more <properties> groups followed by zero or
     more <categories> groups; each group needs at least one member. -->
<!ELEMENT features (properties*, categories*)>
<!ELEMENT properties (property+)>
<!ELEMENT categories (category+)>

<!-- A property needs at least one <references> child and may end with one
     optional <externalReferences> child, as the element documentation
     describes. The previous model, (references)+, rejected
     <externalReferences> altogether; every document that was valid under it
     is still valid here. pid is a mandatory ID. -->
<!ELEMENT property (references+, externalReferences?)>
<!ATTLIST property
    pid   ID    #REQUIRED
    lemma CDATA #REQUIRED>

<!-- Same content model as <property>. cid has type ID and shares the
     document-wide ID value space with all other ID attributes (including the
     cid of <chunk>). -->
<!ELEMENT category (references+, externalReferences?)>
<!ATTLIST category
    cid   ID    #REQUIRED
    lemma CDATA #REQUIRED>

	  

<!-- RELATIONS ELEMENT -->
<!-- Relation layer: one or more <relation> children. -->
<!ELEMENT relations (relation+)>

<!-- RELATION ELEMENT -->
<!--
An element <relation> contains these attributes:
-	rid: the unique identifier of the relation between two entities
-	from: entity/category/property id of the source element
-	to: entity/category/property id of the target element
-	confidence: (optional): a floating number between 0 and 1. Indicates the confidence weight of the relation
-->
<!-- Empty element. rid is a mandatory ID; from and to are mandatory IDREFs
     that must each match an ID in the document; confidence is optional and
     unchecked (CDATA). -->
<!ELEMENT relation EMPTY>
<!ATTLIST relation
    rid        ID    #REQUIRED
    from       IDREF #REQUIRED
    to         IDREF #REQUIRED
    confidence CDATA #IMPLIED>



<!-- OPINIONS ELEMENT -->
<!-- Opinion layer: one or more <opinion> children. -->
<!ELEMENT opinions (opinion+)>

<!-- OPINION ELEMENT -->
<!--
The <opinion> layer has one attribute: 
-	oid:  the unique identifier of the opinion

The <opinion> layer consists of the following sub-elements:
-	opinion_holder:  whose opinion: speaker or some actor in the text
-	opinion_target:  about what
-	opinion_expression: the expression

-->
<!-- One or more repetitions of the full triple, always in the order
     holder, target, expression; none of the three may be omitted.
     oid is a mandatory ID. -->
<!ELEMENT opinion ((opinion_holder, opinion_target, opinion_expression)+)>
<!ATTLIST opinion
    oid ID #REQUIRED>

<!-- OPINION_HOLDER AND OPINION_TARGET ELEMENT -->
<!--
<opinion_holder> and <opinion_target> elements have the following sub-element:
-	span: this element spans the target term. Within a span, <target> elements refer to the target term using term ids (tid). If the term is a multiword, multiple <target> elements are used.
-->	  
<!-- Both elements take one or more <span> children and have no attributes. -->
<!ELEMENT opinion_holder (span+)>
<!ELEMENT opinion_target (span+)>

<!-- OPINION_EXPRESSION -->
<!--
<opinion_expression> has the following attributes:
-	polarity:  refers to the positive or negative orientation of the expression
-	strength:  refers to the strength of the expression
-	subjectivity:  refers to whether an expression is subjective or not
-	sentiment_semantic_type:  refers to sentiment related semantic types like emotion, judgment, belief, speculation
-	sentiment_product_feature :  refers to specific features of entities, to be used in feature/aspect-based sentiment analysis
-->
<!-- One or more <span> children; every attribute is optional free text. -->
<!ELEMENT opinion_expression (span+)>
<!ATTLIST opinion_expression
    polarity                  CDATA #IMPLIED
    strength                  CDATA #IMPLIED
    subjectivity              CDATA #IMPLIED
    sentiment_semantic_type   CDATA #IMPLIED
    sentiment_product_feature CDATA #IMPLIED>
	  
<!-- REFERENCES AND SPANS -->
<!-- Three nested levels: <references> groups <span> elements, each <span>
     groups <target> elements, and each <target> points at one ID. -->

<!-- REFERENCES ELEMENT: one or more <span> children -->
<!ELEMENT references (span+)>

<!-- SPAN ELEMENT: one or more <target> children -->
<!ELEMENT span (target+)>

<!-- TARGET ELEMENT: empty; id is a mandatory IDREF that must match an ID in
     the document, head is optional free text -->
<!ELEMENT target EMPTY>
<!ATTLIST target
    id   IDREF #REQUIRED
    head CDATA #IMPLIED>


<!-- OLD ELEMENTS DEPRECATED -->
<!-- <events> and <quantifiers> are still declared, but no other content
     model among the declarations in this file refers to them. -->

<!-- Event layer: <events> holds one or more <event>; each <event> holds one
     or more <roles>; each <roles> holds one or more empty <role>. -->
<!ELEMENT events (event+)>
<!ELEMENT event (roles+)>
<!ATTLIST event
    eid      ID    #REQUIRED
    span     IDREF #REQUIRED
    lemma    CDATA #REQUIRED
    pos      CDATA #REQUIRED
    eiid     CDATA #IMPLIED
    class    CDATA #IMPLIED
    tense    CDATA #IMPLIED
    aspect   CDATA #IMPLIED
    polarity CDATA #IMPLIED>

<!ELEMENT roles (role+)>
<!ELEMENT role EMPTY>
<!ATTLIST role
    cid  IDREF #REQUIRED
    role CDATA #REQUIRED>

<!-- Quantifier layer: one or more <quantifier>, each with one or more <span>
     children and a mandatory ID. -->
<!ELEMENT quantifiers (quantifier+)>
<!ELEMENT quantifier (span+)>
<!ATTLIST quantifier
    qid ID #REQUIRED>

<!-- back to top -->