package Text::BibTeX::Validate;

use strict;
use warnings;

# ABSTRACT: validator for BibTeX format
our $VERSION = '0.2.0'; # VERSION

use Algorithm::CheckDigits;
use Data::Validate::Email qw( is_email_rfc822 );
use Data::Validate::URI qw( is_uri );
use Scalar::Util qw( blessed );
use Text::BibTeX::Validate::Warning;

require Exporter;
our @ISA = qw( Exporter );
our @EXPORT_OK = qw(
    shorten_DOI
    validate_BibTeX
);

sub shorten_DOI($)
{
    my( $doi ) = @_;

    return $doi if $doi =~ s|^https?://(dx\.)?doi\.org/||;
    return $doi if $doi =~ s|^doi:||;
    return $doi;
}

sub validate_BibTeX
{
    my( $what ) = @_;

    if( blessed $what && $what->isa( 'Text::BibTeX::Entry' ) ) {
        $what = { map { $_ => $what->get($_) } $what->fieldlist };
    }

    # TODO: check for duplicated keys
    my $entry = { map { lc $_ => $what->{$_} } keys %$what };

    # Report and remove empty keys
    for my $key (sort keys %$entry) {
        next if defined $entry->{$key};
        _warn_value( 'undefined value', $entry, $key );
        delete $entry->{$key};
    }

    if( exists $entry->{email} &&
        !defined is_email_rfc822 $entry->{email} ) {
        _warn_value( 'value \'%(value)s\' does not look like valid ' .
                     'email address',
                     $entry,
                     'email' );
    }

    if( exists $entry->{doi} ) {
        my $doi = $entry->{doi};
        my $doi_now = shorten_DOI $doi;

        if( $doi_now !~ m|^10\.[^/]+/| ) {
            _warn_value( 'value \'%(value)s\' does not look like valid DOI',
                         $entry,
                         'doi' );
        } elsif( $doi ne $doi_now ) {
            _warn_value( 'value \'%(value)s\' is better written as \'%(suggestion)s\'',
                         $entry,
                         'doi',
                         { suggestion => $doi_now } );
        }
    }

    # Validated according to BibTeX recommendations
    if( exists $entry->{month} &&
        $entry->{month} !~ /^(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\.?$/i ) {
        _warn_value( 'value \'%(value)s\' does not look like valid month',
                     $entry,
                     'month' );
    }

    if( exists $entry->{year} ) {
        # Sometimes bibliographies list the next year to show that they
        # are going to be published soon.
        my @localtime = localtime;
        if( $entry->{year} !~ /^[0-9]{4}$/ ) {
            _warn_value( 'value \'%(value)s\' does not look like valid year',
                         $entry,
                         'year' );
        } elsif( $entry->{year} > $localtime[5] + 1901 ) {
            _warn_value( 'value \'%(value)s\' is too far in the future',
                         $entry,
                         'year' );
        }
    }

    # Both keys are nonstandard
    for my $key ('isbn', 'issn') {
        next if !exists $entry->{$key};
        my $check = CheckDigits( $key );
        if( $key eq 'isbn' ) {
            my $value = $entry->{$key};
            $value =~ s/-//g;
            if( length $value == 13 ) {
                $check = CheckDigits( 'isbn13' );
            }
        }
        next if $check->is_valid( $entry->{$key} );
        _warn_value( 'value \'%(value)s\' does not look like valid %(FIELD)s',
                     $entry,
                     $key,
                     { FIELD => uc $key } );
    }

    # Both keys are nonstandard
    for my $key ('eprint', 'url') {
        next if !exists $entry->{$key};
        next if defined is_uri $entry->{$key};

        if( $entry->{$key} =~ /^(.*)\n$/ && defined is_uri $1 ) {
            # BibTeX converted from YAML (i.e., Debian::DEP12) might
            # have trailing newline character attached.
            _warn_value( 'URL has trailing newline character',
                         $entry,
                         $key );
            next;
        }

        _warn_value( 'value \'%(value)s\' does not look like valid URL',
                     $entry,
                     $key );
    }

    # Nonstandard
    if( exists $entry->{pmid} ) {
        if( $entry->{pmid} =~ /^PMC[0-9]{7}$/ ) {
            _warn_value( 'PMCID \'%(value)s\' is provided instead of PMID',
                         $entry,
                         'pmid' );
        } elsif( $entry->{pmid} !~ /^[1-9][0-9]*$/ ) {
            _warn_value( 'value \'%(value)s\' does not look like valid PMID',
                         $entry,
                         'pmid' );
        }
    }
}

sub _warn_value
{
    my( $message, $entry, $field, $extra ) = @_;
    $extra = {} unless $extra;
    warn Text::BibTeX::Validate::Warning->new(
            $message,
            { field => $field,
              value => $entry->{$field},
              %$extra } );
}

1;

__END__

=pod

=head1 NAME

Text::BibTeX::Validate - validator for BibTeX format

=head1 SYNOPSIS

    use Text::BibTeX;
    use Text::BibTeX::Validate qw( validate_BibTeX );

    my $bibfile = Text::BibTeX::File->new( 'bibliography.bib' );
    while( my $entry = Text::BibTeX::Entry->new( $bibfile ) ) {
        validate_BibTeX( $entry );
    }

=head1 DESCRIPTION

Text::BibTeX::Validate checks the standard fields of BibTeX entries for
their compliance with their format. In particular, value of C<email> is
checked against RFC 822 mandated email address syntax, value of C<doi>
is checked to start with C<10.> and contain at least one C</> and so on.
Some nonstandard fields as C<isbn>, C<issn> and C<url> are also checked.
Failures of checks are raised as Perl warnings.

Subroutine C<validate_BibTeX> currently accepts plain Perl hash
references containing BibTeX fields and their values, as well as
L<Text::BibTeX::Entry|Text::BibTeX::Entry> objects.

=head1 SEE ALSO

perl(1)

=head1 AUTHORS

Andrius Merkys, E<lt>merkys@cpan.orgE<gt>

=cut
