Re: Looking for Order in the Subject Hierarchies

From Katherine Don
DateFri, 03 Jan 2003 10:15:00 +1300
Subject Re: Looking for Order in the Subject Hierarchies
In-Reply-To (AF571FCBBF4E4B48A00F66199E4273A67E7537-exchangew2k-mercycorps-org)
hi Pam.

this is a bug in the Hierarchy classifier. I've attached a modified version of the perl module (Hierarchy.pm) which hopefully fixes your problem.
Just put it in gsdl/perllib/classify and rerun the building.
(If you use command line building, you can just run 'buildcol.pl -mode infodb coll-name' to only rebuild the database not the indexes. Then you need to copy building/text/xxx.ldb over to index/text/xxx.ldb)

please let me know if this doesn't fix your problem.

cheers,
Katherine.

Pam Osborne wrote:

> I am using a subject hierarchy classifier for several of my collections and am having a problem with where documents appear in the hierarchy.
>
> This is a portion of the index file:
>
> 2390 2390 "Human Resources
> 2390.030 2390.030 "Benefits
> 2390.120 2390.120 Consultants
> 2390.330 2390.330 Health/Safety
> 2390.420 2390.420 "Interns & Volunteers"
> 2390.450 2390.450 "Mercy Corps Scotland"
> 2390.480 2390.480 "New Hire Packet/Orientation"
> 2390.510 2390.510 "Org Charts/Staff & Location Lists"
> 2390.570 2390.570 "Payroll/Time/Attendance"
>
> This is where two documents are appearing in the middle of the hierarchy:
> *Human Resources
> *Benefits
> *Consultants
> "Philosophy of the Human Resources Department" (document)
> "Who Does What in Human Resources" (document)
> *Health/Safety
> *Interns & Volunteers
> *Mercy Corps Scotland
> *New Hire Packet/Orientation
> *Org Charts/Staff & Location Lists
> *Payroll/Time/Attendance
>
> The documents are assigned the number 2390, and I want them to appear in the list directly under Human Resources. Instead they are appearing in the middle of the list, making it look like they have a number between 2390.120 and 2390.330 and/or making it look like they are "Consultants" documents. We've looked at the example collect.cfg files for some of the NZDL Demo Collections and cannot see where hierarchy placement is defined.
>
> Pam Osborne, Digital Librarian
> Mercy Corps
> 3015 SW First Avenue
> Portland, OR 97201
> Phone: (503)471-2494 Fax: (503)796-6843
> posborne@mercycorps.org


<<attachment>>
Type: text/plain
Filename: Hierarchy.pm

###########################################################################
#
# Hierarchy.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# classifier plugin for generating hierarchical classifications

# options for this classifier are:
# -hfile file.txt - classification file
# -metadata Metaname - metadata field to test against file.txt
# -sort Meta - this option is optional (genious;-). by default this
# classifier will sort documents within each section
# alphabetically by Title. sort=nosort prevents sorting
# (i.e. documents will end up in build order), sort=Meta
# will sort each field alphabetically by Meta (Meta may
# also be 'Filename' to sort by the original filename).
# -buttonname Title - another optional field. this is what will end up in the
# Title field for this classification. if not present it
# defaults to Metaname
# -hlist_at_top - use a horizontal list for the top level (i.e. display it
# like an AZList classification)

# 12/05/02 Added usage datastructure - John Thompson
# 12/05/02 Modified new() so as not to die on error, only on init() - John Thompson

package Hierarchy;

use BasClas;
use util;
use cfgread;
use sorttools;

sub BEGIN {
@ISA = ('BasClas');
}

my $arguments =
[ { 'name' => "metadata",
'desc' => "Metadata field used for classification. List will be sorted by this element.",
'type' => "metadata",
'reqd' => "yes" } ,
{ 'name' => "buttonname",
'desc' => "Button name for this classification. Defaults to metadata name.",
'type' => "string",
'reqd' => "no" } ,
{ 'name' => "hfile",
'desc' => "The classification structure file.",
'type' => "string",
'reqd' => "yes" } ,
{ 'name' => "sort",
'desc' => "Metadata field to sort by (defaults to none).",
'type' => "string",
'reqd' => "no" } ,
{ 'name' => "hlist_at_top",
'desc' => "Display the first level of the classification horizontally.",
'type' => "flag",
'reqd' => "no" }
];

my $options =
{ 'name' => "Hierarchy",
'desc' => "Classifier plugin for generating hierarchical classifications",
'inherits' => "Yes" ,
'args' => $arguments };

sub print_usage {
print STDERR "
usage: classify Hierarchy [options]
options:

-buttonname X Title field for this classification.
Defaults to metadata name.

-metadata X Metadata field used for classification,
list will be sorted by this element,
unless -sort is used.

-hfile X The classification structure file

-sort X Metadata field to sort by (defaults to -metadata)
use '-sort nosort' for no sorting.

-hlist_at_top Display the first level of the classification
horizontally.
";
}


sub new {
my $class = shift (@_);
my $self = new BasClas($class, @_);

# 14-05-02 To allow for proper inheritance of arguments - John Thompson
my $option_list = $self->{'option_list'};
push( @{$option_list}, $options );

my ($hfile, $metadata, $sortname, $title, $hlist_at_top);

if (!parsargv::parse(@_,
q^buttonname/.*/^, $title,
q^sort/.*/^, $sortname,
q^hfile/.*/^, $hfile,
q^metadata/.*/^, $metadata,
q^hlist_at_top^, $hlist_at_top,
"allow_extra_options")) {

$self->{'construction_error'} = "Incorrect options passed to $class, check your collect.cfg file.";
}

if (!$metadata) {
$self->{'construction_error'} = "Hierarchy error: no metadata supplied.";
}

$title = $metadata unless ($title);
# if no sortname specified, it defaults to metadata
$sortname = $metadata unless ($sortname);
$sortname = undef if $sortname =~ /^nosort$/;
my $subjectfile;

if (!$hfile) {
$self->{'construction_error'} = "Hierarchy error: No -hfile supplied.";
}
else
{
$subjectfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $hfile);
if (!-e $subjectfile) {
my $collfile = $subjectfile;
$subjectfile = &util::filename_cat($ENV{'GSDLHOME'},"etc", $hfile);
if (!-e $subjectfile) {
my $outhandle = $self->{'outhandle'};
&print_usage;
print STDERR " Hierarchy Error: Can't locate subject file $hfile ";
print STDERR "This file should be in $collfile or $subjectfile ";
die " ";
}
}
}

$self->{'descriptorlist'} = {}; # first field in subject file
$self->{'locatorlist'} = {}; # second field in subject file
$self->{'subjectfile'} = $subjectfile;
$self->{'metaname'} = $metadata;
$self->{'sortname'} = $sortname;
$self->{'title'} = $title;
$self->{'hlist_at_top'} = $hlist_at_top;

return bless $self, $class;
}

sub init {
my $self = shift (@_);

if(defined $self->{'construction_error'} || !defined $self->{'metaname'} || !defined $self->{'subjectfile'}) {
print STDERR "Error: " , $self->{'construction_error'} , " ";
&print_usage;
die " ";
}

# read in the subject file
my $list = &cfgread::read_cfg_file ($self->{'subjectfile'}, undef, '^[^#]?w');

# $list is a hash that is indexed by the descriptor. The contents of this
# hash is a list of two items. The first item is the OID and the second item
# is the title
foreach $descriptor (keys (%$list)) {
$self->{'descriptorlist'}->{$descriptor} = $list->{$descriptor}->[0];
unless (defined $self->{'locatorlist'}->{$list->{$descriptor}->[0]}) {
$self->{'locatorlist'}->{$list->{$descriptor}->[0]}->{'title'} = $list->{$descriptor}->[1];
$self->{'locatorlist'}->{$list->{$descriptor}->[0]}->{'contents'} = [];
}
}
}

sub classify {
my $self = shift (@_);
my ($doc_obj) = @_;

my $doc_OID = $doc_obj->get_OID();

my $metadata = $doc_obj->get_metadata ($doc_obj->get_top_section(),
$self->{'metaname'});

my $sortmeta = "";
if (defined $self->{'sortname'}) {
if ($self->{'sortname'} =~ /^filename$/i) {
$sortmeta = $doc_obj->get_source_filename();
} else {
$sortmeta = $doc_obj->get_metadata_element($doc_obj->get_top_section(),
$self->{'sortname'});
if (defined $sortmeta) {
if ($self->{'sortname'} eq "Creator") {
&sorttools::format_string_name_english ($sortmeta);
} else {
&sorttools::format_string_english ($sortmeta);
}
}
}
$sortmeta = "" unless defined $sortmeta;
}

foreach $metaelement (@$metadata) {
if ((defined $self->{'descriptorlist'}->{$metaelement}) &&
(defined $self->{'locatorlist'}->{$self->{'descriptorlist'}->{$metaelement}})) {
push (@{$self->{'locatorlist'}->{$self->{'descriptorlist'}->{$metaelement}}->{'contents'}},
[$doc_OID, $sortmeta]);
}
}
}

sub get_classify_info {
my $self = shift (@_);

my $list = $self->{'locatorlist'};

my ($classifyinfo);
if ($self->{'hlist_at_top'}) {
$classifyinfo = $self->get_entry ($self->{'title'}, "HList", "Invisible");
} else {
$classifyinfo = $self->get_entry ($self->{'title'}, "VList", "Invisible");
}
# sorted the keys - otherwise funny things happen - kjdon 03/01/03
foreach $OID (sort keys (%$list)) {
my $tempinfo = $self->get_OID_entry ($OID, $classifyinfo, $list->{$OID}->{'title'}, "VList");

if (defined $self->{'sortname'}) {
foreach $subOID (sort {$a->[1] cmp $b->[1];} @{$list->{$OID}->{'contents'}}) {
push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]});
}
} else {
foreach $subOID (@{$list->{$OID}->{'contents'}}) {
push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]});
}
}
}

return $classifyinfo;
}

sub get_OID_entry {
my $self = shift (@_);
my ($OID, $classifyinfo, $title, $classifytype) = @_;

$OID = "" unless defined $OID;
$OID =~ s/^.+//;

my ($headOID, $tailOID) = $OID =~ /^(d+)(.*)$/;
$tailOID = "" unless defined $tailOID;

if (!defined $headOID) {
$classifyinfo->{'Title'} = $title;
$classifyinfo->{'classifytype'} = $classifytype;
return $classifyinfo;
}

$classifyinfo->{'contains'} = [] unless defined $classifyinfo->{'contains'};
my $offset = 0;
foreach $thing (@{$classifyinfo->{'contains'}}) {
$offset ++ if defined $thing->{'OID'};
}

while (scalar(@{$classifyinfo->{'contains'}}) < ($headOID+$offset)) {
push (@{$classifyinfo->{'contains'}}, $self->get_entry("", $classifytype));
}

return $self->get_OID_entry ($tailOID, $classifyinfo->{'contains'}->[($headOID+$offset-1)], $title, $classifytype);
}

sub get_entry {
my $self = shift (@_);
my ($title, $childtype, $thistype) = @_;

# organise into classification structure
my %classifyinfo = ('childtype'=>$childtype,
'Title'=>$title,
'contains'=>[]);
$classifyinfo{'thistype'} = $thistype
if defined $thistype && $thistype =~ /w/;

return %classifyinfo;
}


1;