<!--
Source: https://github.com/opener-project/kaf
Tip revision: 171be1314bd1415dae9f79390d0e08f23a7c46d4 authored by aitor-garcia-p on 09 July 2013, 12:32:05 UTC
Commit message: Update message about the wiki
File: kaf-21.dtd
-->
<!-- DTD KAF 2.1 -->
<!ELEMENT KAF (
    kafHeader | text | terms | deps | chunks | entities |
    coreferences | features | relations | opinions
)*>
<!-- KAF ELEMENT: the listed children may appear in any order and any number of times (including none). All three attributes are optional. -->
<!ATTLIST KAF
    doc       CDATA  #IMPLIED
    xml:lang  CDATA  #IMPLIED
    version   CDATA  #IMPLIED>
<!-- KAFHEADER ELEMENT: an optional <fileDesc>, then an optional <public>, then any number of <linguisticProcessors>, in that order. -->
<!ELEMENT kafHeader (fileDesc?, public?, linguisticProcessors*)>
<!-- FILEDESC ELEMENT -->
<!--
<fileDesc> is an empty element containing information about the
computer document itself. It has the following attributes:
- title: the title of the document (optional).
- author: the author of the document (optional).
- creationtime: when the document was created. In ISO 8601. (optional)
- filename: the original file name (optional).
- filetype: the original format (PDF, HTML, DOC, etc) (optional).
- pages: number of pages of the original document (optional).
-->
<!ELEMENT fileDesc EMPTY>
<!-- Every attribute is optional and typed CDATA, so the DTD does not check any value format (for example the ISO 8601 creation time). -->
<!ATTLIST fileDesc
    title         CDATA  #IMPLIED
    author        CDATA  #IMPLIED
    creationtime  CDATA  #IMPLIED
    filename      CDATA  #IMPLIED
    filetype      CDATA  #IMPLIED
    pages         CDATA  #IMPLIED>
<!-- PUBLIC ELEMENT -->
<!--
<public> is an empty element which stores public information about
the document, such as its URI. It has the following attributes:
- publicId: a public identifier (for instance, the number inserted by the capture server) (optional).
- uri: a public URI of the document (optional).
-->
<!ELEMENT public EMPTY>
<!-- Both attributes are optional free text. -->
<!ATTLIST public
    publicId  CDATA  #IMPLIED
    uri       CDATA  #IMPLIED>
<!-- LINGUISTICPROCESSORS ELEMENT -->
<!--
<linguisticProcessors> elements store the information about which linguistic processors
produced the KAF document. There can be several <linguisticProcessors> elements, one
per KAF layer. KAF layers correspond to the top-level elements of the
documents, such as "text", "terms", "deps" etc.
-->
<!-- At least one <lp> child is required; the layer attribute is mandatory. -->
<!ELEMENT linguisticProcessors (lp+)>
<!ATTLIST linguisticProcessors
    layer  CDATA  #REQUIRED>
<!-- LP ELEMENT -->
<!--
<lp> elements describe one specific linguistic processor. <lp> elements
have the following attributes:
- name: the name of the processor
- version: processor's version
- timestamp: a timestamp, denoting the date/time at which the processor was
launched. The timestamp follows the XML Schema xs:dateTime type (See
http://www.w3.org/TR/xmlschema-2/#isoformats). In summary, the date is
specified following the form "YYYY-MM-DDThh:mm:ss" (all fields
required). To specify a time zone, you can either enter a dateTime in UTC
time by adding a "Z" behind the time ("2002-05-30T09:00:00Z") or you can
specify an offset from the UTC time by adding a positive or negative time
behind the time ("2002-05-30T09:00:00+06:00").
-->
<!ELEMENT lp EMPTY>
<!-- All three attributes are mandatory. They are typed CDATA, so the xs:dateTime form of the timestamp is a convention that the DTD itself does not enforce. -->
<!ATTLIST lp
    name       CDATA  #REQUIRED
    version    CDATA  #REQUIRED
    timestamp  CDATA  #REQUIRED>
<!-- TEXT ELEMENT: a non-empty sequence of <wf> word forms. -->
<!ELEMENT text (wf+)>
<!-- WORDFORM ELEMENT -->
<!--
<wf> elements describe and contain all word forms generated after the tokenization step.
<wf> elements have the following attributes:
- wid: the id of the word form (REQUIRED and UNIQUE)
- sent: sentence id of the word form (optional)
- para: paragraph id of the word form (optional)
- page: page id of the word form (optional)
- offset: the offset (in characters) of the word form (optional)
- length: the length (in characters) of the word form (optional)
- xpath: in case of source xml files, the xpath expression identifying the original word form (optional)
-->
<!ELEMENT wf (#PCDATA)>
<!-- wid is typed ID: a validating parser requires it to be an XML name that is unique among all ID values in the document. The remaining attributes are optional free text. -->
<!ATTLIST wf
    wid     ID     #REQUIRED
    sent    CDATA  #IMPLIED
    para    CDATA  #IMPLIED
    page    CDATA  #IMPLIED
    offset  CDATA  #IMPLIED
    length  CDATA  #IMPLIED
    xpath   CDATA  #IMPLIED>
<!-- TERMS ELEMENT: a non-empty sequence of <term> elements. -->
<!ELEMENT terms (term+)>
<!-- TERM ELEMENT -->
<!--
attributes of term elements
tid: unique identifier (REQUIRED AND UNIQUE)
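type: type of the term. (REQUIRED) Currently, 3 values are possible (only two are
listed here; the third is presumably "entity", see morphofeat below; to be confirmed):
open: open category term
close: close category term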
lemma: lemma of the term (REQUIRED).
pos: part of speech. (REQUIRED) The first letter of the pos attribute
must be one of the following:
N common noun
R proper noun
G adjective
V verb
P preposition
A adverb
C conjunction
D determiner
O other
more complex pos attributes may be formed by concatenating values separated
by a dot ".". For example, in Basque we have "V.ADI.SIN" for simple verbs
or "V.ADI.KON" for complex verbs.
morphofeat: if the term is a named entity (type="entity"), the type of the entity (optional).
case: declension case of the term (optional).
head: if the term is a compound, the id of the head component (optional).
-->
<!--
Content model of <term>: any number (including zero) of <sentiment>, <span>,
<externalReferences> and <component> children, in any order.
This accepts exactly the same documents as the earlier form
"(sentiment?|span|externalReferences|component)+". The "?" inside the
repeated choice was redundant: it made the group nullable and wrongly
suggested that at most one <sentiment> is allowed.
-->
<!ELEMENT term (sentiment | span | externalReferences | component)*>
<!-- tid is typed ID: it must be an XML name and unique among all ID values in the document. type, lemma and pos are mandatory; the other attributes are optional. All except tid are CDATA, so the DTD does not check the documented value sets. -->
<!ATTLIST term
    tid         ID     #REQUIRED
    type        CDATA  #REQUIRED
    lemma       CDATA  #REQUIRED
    pos         CDATA  #REQUIRED
    morphofeat  CDATA  #IMPLIED
    case        CDATA  #IMPLIED
    head        CDATA  #IMPLIED>
<!-- SENTIMENT FEATURES ELEMENTS -->
<!--
<sentiment> elements are empty and carry the following attributes (all optional; the attribute names are written in lowercase, e.g. polarity):
- Resource: identifier and reference to an external sentiment resource
- Polarity: Refers to the property of a word to express positive, negative or no sentiment. These values are possible:
- Positive
- Negative
- Neutral
- Or numerical value on a numerical scale
- Strength: refers to the strength of the polarity
- Weak
- Average
- Strong
- Or Numerical value
- Subjectivity: refers to the property of a word to express an opinion (or not)
- Subjective/Objective
- Factual/opinionated
- Sentiment_semantic_type: refers to a sentiment-related semantic type
- Aesthetics_evaluation
- Moral_judgment
- Emotion
- etc
- Sentiment_modifier: refers to words which modify the polarity of another word
- Intensifier/weakener polarity shifter
- Sentiment_marker: refers to words which themselves do not carry polarity, but are kind of vehicles of it
- Find, think, in my opinion, according to....
- Sentiment_product_feature: refers to a domain; mainly used in feature-based sentiment analysis
- Values are related to a specific domain. For the tourist domain, for example: staff, cleanliness, beds, bathroom, transportation, location, etc.
-->
<!ELEMENT sentiment EMPTY>
<!-- Every attribute is optional and typed CDATA; symbolic and numerical values are both accepted because the DTD does not constrain them. -->
<!ATTLIST sentiment
    resource                   CDATA  #IMPLIED
    polarity                   CDATA  #IMPLIED
    strength                   CDATA  #IMPLIED
    subjectivity               CDATA  #IMPLIED
    sentiment_semantic_type    CDATA  #IMPLIED
    sentiment_product_feature  CDATA  #IMPLIED
    sentiment_modifier         CDATA  #IMPLIED
    sentiment_marker           CDATA  #IMPLIED>
<!-- EXTERNALREFERENCES ELEMENT -->
<!--
The <externalReferences> element is used to associate terms to
external resources, such as elements of a Knowledge base, an ontology,
etc. It consists of several <externalRef> elements, one per
association.
-->
<!ELEMENT externalReferences (externalRef+)>
<!-- EXTERNALREF ELEMENT -->
<!--
<externalRef> elements have the following attributes:
- resource: indicates the identifier of the resource referred to.
- reference: code of the referred element. If the element is a
synset of some version of WordNet, it follows the pattern:
[a-z]{3}-[0-9]{2}-[0-9]+-[nvars]
which is a string composed by four fields separated by a dash.
The four fields are the following:
- Language code (three characters).
- WordNet version (two digits).
- Synset identifier composed by digits.
- POS character:
n noun
v verb
a adjective
r adverb
examples of valid patterns are: ``ENG-20-12345678-n'',
``SPA-16-017403-v'', etc.
- confidence: a floating number between 0 and 1. Indicates the confidence weight of the association
-->
<!ELEMENT externalRef EMPTY>
<!-- resource and reference are mandatory, confidence is optional. All are CDATA, so neither the WordNet reference pattern nor the 0..1 range of confidence is checked by the DTD. -->
<!ATTLIST externalRef
    resource    CDATA  #REQUIRED
    reference   CDATA  #REQUIRED
    confidence  CDATA  #IMPLIED>
<!-- COMPONENT ELEMENT -->
<!--
Compound and multiword terms can be represented in KAF by including <component> elements within <term> elements.
The <component> elements have the following attributes:
- id: unique identifier (REQUIRED and UNIQUE)
- lemma: lemma of the term (REQUIRED)
- pos: part of speech (REQUIRED)
- case: declension case (optional)
-->
<!-- A component may hold any number of <externalReferences> children, including none. -->
<!ELEMENT component (externalReferences*)>
<!-- id is typed ID: it must be an XML name and unique among all ID values in the document. -->
<!ATTLIST component
    id     ID     #REQUIRED
    lemma  CDATA  #REQUIRED
    pos    CDATA  #REQUIRED
    case   CDATA  #IMPLIED>
<!-- DEPS ELEMENT: a non-empty sequence of <dep> elements. -->
<!ELEMENT deps (dep+)>
<!-- DEP ELEMENT -->
<!--
The <dep> elements have the following attributes:
- from: term id of the source element (REQUIRED)
- to: term id of the target element (REQUIRED)
- rfunc: relational function (REQUIRED)
- case: declension case (optional)
-->
<!ELEMENT dep EMPTY>
<!-- from and to are typed IDREF: each must equal the value of some ID attribute in the document. The DTD cannot restrict them to term ids specifically. -->
<!ATTLIST dep
    from   IDREF  #REQUIRED
    to     IDREF  #REQUIRED
    rfunc  CDATA  #REQUIRED
    case   CDATA  #IMPLIED>
<!-- CHUNKS ELEMENT: a non-empty sequence of <chunk> elements. -->
<!ELEMENT chunks (chunk+)>
<!-- CHUNK ELEMENT -->
<!--
The <chunk> elements have the following attributes:
- cid: unique identifier (REQUIRED)
- head: the chunk head’s term id (REQUIRED)
- phrase: type of the phrase (REQUIRED)
- case: declension case (optional)
-->
<!ELEMENT chunk (span+)>
<!-- cid is typed ID and head is typed IDREF: cid must be unique among all ID values in the document, and head must match an existing ID value. -->
<!ATTLIST chunk
    cid     ID     #REQUIRED
    head    IDREF  #REQUIRED
    phrase  CDATA  #REQUIRED
    case    CDATA  #IMPLIED>
<!-- ENTITIES ELEMENT: a non-empty sequence of <entity> elements. -->
<!ELEMENT entities (entity+)>
<!-- ENTITY ELEMENT -->
<!--
A named entity element has the following attributes:
- eid: the id for the named entity (REQUIRED)
- type: type of the named entity. (REQUIRED) Currently, 8 values are possible:
- Person
- Organization
- Location
- Date
- Time
- Money
- Percent
- Misc
-->
<!ELEMENT entity (references+)>
<!-- eid is typed ID (unique among all ID values in the document). type is CDATA, so the documented list of entity types is not enforced by the DTD. -->
<!ATTLIST entity
    eid   ID     #REQUIRED
    type  CDATA  #REQUIRED>
<!-- COREFERENCES ELEMENT: a non-empty sequence of <coref> elements. -->
<!ELEMENT coreferences (coref+)>
<!-- COREF ELEMENT -->
<!--
<coref> element has the following attribute:
- coid: unique id, starting with the prefix “co”
-->
<!ELEMENT coref (references+)>
<!-- coid is typed ID: it must be an XML name and unique among all ID values in the document. -->
<!ATTLIST coref
    coid  ID  #REQUIRED>
<!-- FEATURES OF SENTIMENT ELEMENT -->
<!--
<features> element may contain a <properties> element and a <categories> element.
<properties> element contains one or more <property> elements. A <property> element has the following attributes:
- pid: the unique identifier of the property
- lemma: lemma of the property
<categories> element contains one or more <category> elements. A <category> element has the following attributes:
- cid: the unique identifier of the category
- lemma: lemma of the category
<property> and <category> elements have the following sub-elements:
- references: this element contains one or more reference elements
- externalReferences (optional): this element contains one or more externalRef elements
-->
<!-- <features> holds any number of <properties> groups followed by any number of <categories> groups; each group needs at least one member. -->
<!ELEMENT features    (properties*, categories*)>
<!ELEMENT properties  (property+)>
<!ELEMENT categories  (category+)>
<!--
<property> and <category> share one content model: one or more <references>,
optionally followed by a single <externalReferences>.
The optional <externalReferences> child is described in the features
documentation but was previously rejected by the DTD. Every document that
was valid under the earlier "(references)+" model is still valid.
-->
<!ELEMENT property (references+, externalReferences?)>
<!-- pid is typed ID: it must be an XML name and unique among all ID values in the document. -->
<!ATTLIST property
    pid    ID     #REQUIRED
    lemma  CDATA  #REQUIRED>
<!ELEMENT category (references+, externalReferences?)>
<!-- cid is typed ID. ID values share one namespace per document, so a category cid must also differ from every other ID value (for example a chunk cid). -->
<!ATTLIST category
    cid    ID     #REQUIRED
    lemma  CDATA  #REQUIRED>
<!-- RELATIONS ELEMENT: a non-empty sequence of <relation> elements. -->
<!ELEMENT relations (relation+)>
<!-- RELATION ELEMENT -->
<!--
An element <relation> contains these attributes:
- rid: the unique identifier of the relation between two entities
- from: entity/category/property id of the source element
- to: entity/category/property id of the target element
- confidence: (optional): a floating number between 0 and 1. Indicates the confidence weight of the relation
-->
<!ELEMENT relation EMPTY>
<!-- rid is typed ID; from and to are typed IDREF and must each match an existing ID value. The DTD cannot limit them to entity, category or property ids. confidence is optional free text. -->
<!ATTLIST relation
    rid         ID     #REQUIRED
    from        IDREF  #REQUIRED
    to          IDREF  #REQUIRED
    confidence  CDATA  #IMPLIED>
<!-- OPINIONS ELEMENT: a non-empty sequence of <opinion> elements. -->
<!ELEMENT opinions (opinion+)>
<!-- OPINION ELEMENT -->
<!--
The <opinion> layer has one attribute:
- oid: the unique identifier of the opinion
The <opinion> layer consists of the following sub-elements:
- opinion_holder: whose opinion: speaker or some actor in the text
- opinion_target: about what
- opinion_expression: the expression
-->
<!-- An opinion is one or more (holder, target, expression) triples; within each triple all three children are required, in this order. -->
<!ELEMENT opinion (opinion_holder, opinion_target, opinion_expression)+>
<!-- oid is typed ID: it must be an XML name and unique among all ID values in the document. -->
<!ATTLIST opinion
    oid  ID  #REQUIRED>
<!-- OPINION_HOLDER AND OPINION_TARGET ELEMENT -->
<!--
<opinion_holder> and <opinion_target> elements have the following sub-element:
- span: this element spans the target term. Target elements are used to refer to the target term, using term ids (tid). If the term is a multiword, multiple target elements are used.
-->
<!-- Holder and target have the same shape: one or more <span> children and no attributes. -->
<!ELEMENT opinion_holder  (span+)>
<!ELEMENT opinion_target  (span+)>
<!-- OPINION_EXPRESSION -->
<!--
<opinion_expression> has the following attributes:
- polarity: refers to the positive or negative orientation of the expression
- strength: refers to the strength of the expression
- subjectivity: refers to whether an expression is subjective or not
- sentiment_semantic_type: refers to sentiment related semantic types like emotion, judgment, belief, speculation
- sentiment_product_feature: refers to specific features of entities, to be used in feature/aspect-based sentiment analysis
-->
<!ELEMENT opinion_expression (span+)>
<!-- Every attribute is optional free text (CDATA). -->
<!ATTLIST opinion_expression
    polarity                   CDATA  #IMPLIED
    strength                   CDATA  #IMPLIED
    subjectivity               CDATA  #IMPLIED
    sentiment_semantic_type    CDATA  #IMPLIED
    sentiment_product_feature  CDATA  #IMPLIED>
<!-- REFERENCES AND SPANS -->
<!-- REFERENCES ELEMENT: one or more <span> children. -->
<!ELEMENT references (span+)>
<!-- SPAN ELEMENT: one or more <target> children. -->
<!ELEMENT span (target+)>
<!-- TARGET ELEMENT: empty. id is typed IDREF, so it must match the value of some ID attribute in the document. head is optional free text. -->
<!ELEMENT target EMPTY>
<!ATTLIST target
    id    IDREF  #REQUIRED
    head  CDATA  #IMPLIED>
<!-- OLD ELEMENTS DEPRECATED -->
<!-- NOTE(review): the KAF content model does not list <events>, so it cannot appear as a child of the KAF element; confirm whether that is intended for deprecated layers. -->
<!ELEMENT events (event+)>
<!ELEMENT event (roles+)>
<!-- eid is typed ID and shares the document-wide ID namespace (for example with the eid of an entity). Here span is an IDREF attribute, not the <span> element. -->
<!ATTLIST event
    eid       ID     #REQUIRED
    span      IDREF  #REQUIRED
    lemma     CDATA  #REQUIRED
    pos       CDATA  #REQUIRED
    eiid      CDATA  #IMPLIED
    class     CDATA  #IMPLIED
    tense     CDATA  #IMPLIED
    aspect    CDATA  #IMPLIED
    polarity  CDATA  #IMPLIED>
<!-- Deprecated: <roles> groups one or more empty <role> elements. cid is typed IDREF and must match an existing ID value; role is mandatory free text. -->
<!ELEMENT roles (role+)>
<!ELEMENT role EMPTY>
<!ATTLIST role
    cid   IDREF  #REQUIRED
    role  CDATA  #REQUIRED>
<!-- Deprecated: <quantifiers> groups one or more <quantifier> elements, each holding one or more <span> children. qid is typed ID (unique among all ID values in the document). -->
<!ELEMENT quantifiers (quantifier+)>
<!ELEMENT quantifier (span+)>
<!ATTLIST quantifier
    qid  ID  #REQUIRED>