Re: [greenstone-devel] Duplicate meta field in HTML

From Katherine Don
Date Wed, 30 Jul 2003 10:50:18 +1200
Subject Re: [greenstone-devel] Duplicate meta field in HTML
In-Reply-To (373af9d9-ab1091b0-816ec00-express-cites-uiuc-edu)
hi Xiao

I have appended a revised version of the extract_metadata method that will extract both (or more) values. You need to replace the existing extract_metadata method in perllib/plugins/HTMLPlug.pm with this new one.

hope this helps,
Katherine Don.

xiaohu@uiuc.edu wrote:

> Hi, dear colleagues,
>
> I am building a collection of HTML files, and my files have duplicate meta field like:
>
> <meta name="People" content="James Bond">
> <meta name="People" content="Tom Hanks">
> ....
>
> I did add -metadata fields option of HTMLPlugin in the configuration settings, but the Plugin only processed the first metadata of the duplicate ones. (for the above example, G-stone only plug in "James Bond" .) My G-stone version is 2.38
>
> Did you ever encounter this problem? How did you solve it?
>
> Thank you very much!
>
> Xiao
> Xiao Hu
> *******************************
> Graduate student
> Graduate School of Library and Information Science
> University of Illinois at Urbana-Champaign
> *******************************
>
> _______________________________________________
> greenstone-devel mailing list
> greenstone-devel@list.scms.waikato.ac.nz
> https://list.scms.waikato.ac.nz/mailman/listinfo/greenstone-devel

# Extract metadata from an HTML document and attach it to a document object.
#
# Arguments (after $self):
#   $textref  - reference to the HTML text to scan
#   $metadata - hash ref of metadata passed in from a previous (recursive)
#               plugin; any field already defined there is skipped
#   $doc_obj  - object that receives each value through
#               add_utf8_metadata($section, $name, $value)
#   $section  - section identifier handed through to add_utf8_metadata
#
# Settings read from $self:
#   metadata_fields       - comma-separated list of fields to extract; when
#                           undefined the method returns immediately
#   hunt_creator_metadata - when defined, author-like <meta> tags are also
#                           recorded as "Creator"
#   title_sub             - optional regex stripped from the start of the
#                           fallback title
#   verbosity, outhandle  - progress messages are printed to outhandle when
#                           verbosity is greater than 2
#
# For each requested field, every matching <meta name=... content=...> tag
# is recorded, so duplicate tags yield multiple values.  If no <meta> tag
# matches, "title" falls back to the <title> element (or the first 100
# characters of text) and "tagXX" fields collect the contents of each <XX>
# element.
sub extract_metadata {
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj, $section) = @_;
    my $outhandle = $self->{'outhandle'};

    # if we don't want metadata, we may as well not be here ...
    return if (!defined $self->{'metadata_fields'});

    # evaluated once; the defined() guard avoids a warning when unset
    my $verbose = (defined $self->{'verbosity'} && $self->{'verbosity'} > 2);

    # hunt for an author: look in the <meta> elements
    if (defined $self->{'hunt_creator_metadata'}) {
        for my $name (split /,/, "AUTHOR,AUTHOR.EMAIL,CREATOR,DC.CREATOR,DC.CREATOR.CORPORATENAME") {
            # \Q..\E so the dots in the names match literally
            if ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*"?\Q$name\E"?([^>]*)/is) {
                my $content = $1 . $2;
                if ($content =~ /content\s*=\s*"?(.*)"?/is) {
                    my $value = $1;
                    $value =~ s/"$//;       # greedy capture keeps the closing quote
                    $value =~ s/\s+/ /gs;
                    $doc_obj->add_utf8_metadata($section, "Creator", $value);
                    print $outhandle " extracted Creator metadata \"$value\"\n"
                        if $verbose;
                }
            }
        }
    }

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        my $found = 0;

        # don't need to extract field if it was passed in from a previous
        # (recursive) plugin
        next if defined $metadata->{$field};

        # see if there are <meta> tags for this field; /g in a while loop
        # walks every occurrence, so duplicate tags each add a value
        while ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*"?\Q$field\E"?([^>]*)/isg) {
            my $content = $1 . $2;
            if ($content =~ /content\s*=\s*"?(.*)"?/is) {
                my $value = $1;
                $value =~ s/"$//;
                $value =~ s/\s+/ /gs;
                $value =~ s/".*//gs;    # drop any attributes after the content value
                $doc_obj->add_utf8_metadata($section, $field, $value);
                print $outhandle " extracted \"$field\" metadata \"$value\"\n"
                    if $verbose;
                $found = 1;
            }
        }
        next if $found;

        # TITLE: extract the document title
        if ($field =~ /^title$/i) {

            # see if there's a <title> tag containing at least one word character
            if ($$textref =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
                my $title = $1;
                if ($title =~ /\w/) {
                    $title =~ s/<[^>]*>/ /g;
                    $title =~ s/&nbsp;/ /g;
                    $title =~ s/\s+/ /gs;
                    $title =~ s/^\s+//;
                    $title =~ s/\s+$//;
                    $doc_obj->add_utf8_metadata ($section, $field, $title);
                    print $outhandle " extracted \"$field\" metadata \"$title\"\n"
                        if $verbose;
                    next;
                }
            }

            # if no title use first 100 characters
            my $tmptext = $$textref;
            $tmptext =~ s/<\/([^>]+)><\1>//g;            # (eg) </b><b> - no space
            $tmptext =~ s/<[^>]*>/ /g;
            $tmptext =~ s/(?:&nbsp;|\xc2\xa0)/ /g;       # utf-8 for nbsp...
            $tmptext =~ s/^\s+//s;
            $tmptext =~ s/\s+$//;
            $tmptext =~ s/\s+/ /gs;
            # title_sub is deliberately interpolated as a regex
            $tmptext =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
            $tmptext =~ s/^\s+//s;                       # in case title_sub introduced any...
            $tmptext = substr ($tmptext, 0, 100);
            $tmptext =~ s/\s\S*$/.../;                   # replace the trailing word with an ellipsis
            $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
            print $outhandle " extracted \"$field\" metadata \"$tmptext\"\n"
                if $verbose;
            next;
        }

        # tag: extract the text between each <XX> and </XX> pair for a
        # field named "tagXX" (e.g. "tagH1"); stored under the name XX
        if ($field =~ /^tag[a-z0-9]+$/i) {

            my $tag = $field;
            $tag =~ s/^tag//i;
            my $tmptext = $$textref;
            $tmptext =~ s/\s+/ /gs;
            if ($tmptext =~ /<$tag[^>]*>/i) {
                # /i here as well, so the extraction agrees with the
                # case-insensitive check above
                foreach my $word ($tmptext =~ m/<$tag[^>]*>(.*?)<\/$tag[^>]*>/gi) {
                    $word =~ s/&nbsp;/ /g;
                    $word =~ s/<[^>]*>/ /g;
                    $word =~ s/^\s+//;
                    $word =~ s/\s+$//;
                    $word =~ s/\s+/ /gs;
                    if ($word ne "") {
                        $doc_obj->add_utf8_metadata ($section, $tag, $word);
                        print $outhandle " extracted \"$tag\" metadata \"$word\"\n"
                            if $verbose;
                    }
                }
            }
            next;
        }
    }
}