# ABSTRACT: Driver for the Czech tagset of the CoNLL 2009 Shared Task.
# This tagset differs slightly from the Czech tagset of CoNLL 2006 and 2007.
# Copyright © 2009, 2014 Dan Zeman <zeman@ufal.mff.cuni.cz>

package Lingua::Interset::Tagset::CS::Conll2009;
use strict;
use warnings;
our $VERSION = '2.003'; # VERSION: generated by DZP::OurPkgVersion

use utf8;
use open ':utf8';
use namespace::autoclean;
use Moose;
extends 'Lingua::Interset::Tagset::CS::Conll';



#------------------------------------------------------------------------------
# Decodes a physical tag (string) and returns the corresponding feature
# structure.
#------------------------------------------------------------------------------
sub decode
{
    my $self = shift;
    my $tag = shift;
    my $old = _conll_2009_to_2006($tag);
    my $fs = $self->SUPER::decode($old);
    # Here we could set $fs->set_tagset('cs::conll2009') but we will not so that all
    # the descendants of cs::pdt can share the same feature structures.
    return $fs;
}



#------------------------------------------------------------------------------
# Takes feature structure and returns the corresponding physical tag (string).
#------------------------------------------------------------------------------
sub encode
{
    my $self = shift;
    my $fs = shift; # Lingua::Interset::FeatureStructure
    my $tag = $self->SUPER::encode($fs);
    return _conll_2006_to_2009($tag);
}



#------------------------------------------------------------------------------
# Returns reference to list of known tags.
#------------------------------------------------------------------------------
sub list
{
    my $self = shift;
    my $list = $self->SUPER::list();
    my @list = map {_conll_2006_to_2009($_)} (@{$list});
    return \@list;
}



#------------------------------------------------------------------------------
# Converts a CoNLL 2006 Czech tag into the CoNLL 2009 format.
#------------------------------------------------------------------------------
sub _conll_2006_to_2009
{
    my $tag = shift;
    # CoNLL 2006 contains tab-delimited values of the columns CPOS, POS and FEATS.
    # CoNLL 2009 contains tab-delimited values of the columns POS and FEAT (or, possibly of PPOS and PFEAT).
    $tag =~ s/^(\S+)\t(\S+)\t(.*)$/$1\tSubPOS=$2|$3/;
    $tag =~ s/\|_$//;
    # For some reason, CoNLL 2009 data do not set number and person of the word "by" while older data did so.
    # In fact, PDT 2.0 morphology returns "Vc-------------" for this word, and only CoNLL 2006 data had "Num=X|Per=3".
    # CoNLL 2007 data did not have person for "by" but otherwise they can be decoded using the old cs::conll driver.
    if($tag eq "V\tSubPOS=c|Num=X|Per=3")
    {
        $tag = "V\tSubPOS=c";
    }
    return $tag;
}



#------------------------------------------------------------------------------
# Converts a CoNLL 2009 Czech tag into the CoNLL 2006 format.
#------------------------------------------------------------------------------
sub _conll_2009_to_2006
{
    my $tag = shift;
    # CoNLL 2006 contains tab-delimited values of the columns CPOS, POS and FEATS.
    # CoNLL 2009 contains tab-delimited values of the columns POS and FEAT (or, possibly of PPOS and PFEAT).
    my @columns = split(/\t/, $tag);
    if(scalar(@columns)==2)
    {
        my @features0 = split(/\|/, $columns[1]);
        my @features1;
        my $subpos = '_';
        foreach my $f (@features0)
        {
            if($f =~ m/^SubPOS=(.+)$/)
            {
                $subpos = $1;
            }
            else
            {
                push(@features1, $f);
            }
        }
        my $features = scalar(@features1) ? join('|', @features1) : '_';
        $tag = "$columns[0]\t$subpos\t$features";
    }
    # For some reason, CoNLL 2009 data do not set number and person of the word "by" while older data did so.
    # In fact, PDT 2.0 morphology returns "Vc-------------" for this word, and only CoNLL 2006 data had "Num=X|Per=3".
    # CoNLL 2007 data did not have person for "by" but otherwise they can be decoded using the old cs::conll driver.
    if($tag eq "V\tc\t_")
    {
        $tag = "V\tc\tNum=X|Per=3";
    }
    return $tag;
}



1;

__END__

=pod

=encoding UTF-8

=head1 NAME

Lingua::Interset::Tagset::CS::Conll2009 - Driver for the Czech tagset of the CoNLL 2009 Shared Task.

=head1 VERSION

version 2.003

=head1 SYNOPSIS

  use Lingua::Interset::Tagset::CS::Conll2009;
  my $driver = Lingua::Interset::Tagset::CS::Conll2009->new();
  my $fs = $driver->decode("N\tSubPOS=N|Gen=M|Num=S|Cas=1|Neg=A");

or

  use Lingua::Interset qw(decode);
  my $fs = decode('cs::conll2009', "N\tSubPOS=N|Gen=M|Num=S|Cas=1|Neg=A");

=head1 DESCRIPTION

Interset driver for the Czech tagset of the CoNLL 2009 Shared Task.
CoNLL 2009 tagsets in Interset are traditionally two values separated by tabs.
The values come from the CoNLL 2009 columns POS and FEAT. For Czech,
these values are derived from the tagset of the Prague Dependency Treebank; however,
there is an additional surface feature C<Sem>, which is derived from PDT lemmas.
The CoNLL 2009 tagset differs slightly from CoNLL 2006 and 2007:
the (fine-grained) C<POS> column of 2006 and 2007 has been moved to the C<FEAT>
column as a new feature called C<SubPOS>.
This driver is a translation layer above the C<cs::conll> driver.

=head1 SEE ALSO

L<Lingua::Interset>,
L<Lingua::Interset::Tagset>,
L<Lingua::Interset::Tagset::CS::Pdt>,
L<Lingua::Interset::Tagset::CS::Conll>,
L<Lingua::Interset::FeatureStructure>

=head1 AUTHOR

Dan Zeman <zeman@ufal.mff.cuni.cz>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2014 by Univerzita Karlova v Praze (Charles University in Prague).

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut
