[greenstone-users] GLI client

From: Javeria Sharif
Date: Thu Jul 10 20:11:28 2008
Subject: [greenstone-users] GLI client
C:\Program Files\Greenstone\bin\script>pluginfo.pl -xml HTMLPlug
v-string in use/require non-portable at C:\Program Files\Greenstone/perllib/cpan/Image/Size.pm line 20.
<?xml version="1.0" encoding="UTF-8"?>
<PlugInfo>
<Name>HTMLPlug</Name>
<Desc>This plugin processes HTML files</Desc>
<Abstract>no</Abstract>
<Inherits>yes</Inherits>
<Processes>(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$</Processes>
<Blocks>(?i)\.(gif|jpe?g|jpe|jpg|png|css)$</Blocks>
<Explodes>no</Explodes>
<Arguments>
<Option>
<Name>process_exp</Name>
<Desc>A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i)\.html?$' matches all documents ending in .htm or .html (case-insensitive).</Desc>
<Type>regexp</Type>
<Default>(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$</Default>
</Option>
<Option>
<Name>block_exp</Name>
<Desc>Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.</Desc>
<Type>regexp</Type>
<Default>(?i)\.(gif|jpe?g|jpe|jpg|png|css)$</Default>
</Option>
<Option>
<Name>nolinks</Name>
<Desc>Don't make any attempt to trap links (setting this flag may improve speed of building/importing but any relative links within documents will be broken).</Desc>
<Type>flag</Type>
</Option>
<Option>
<Name>keep_head</Name>
<Desc>Don't remove headers from html files.</Desc>
<Type>flag</Type>
</Option>
<Option>
<Name>no_metadata</Name>
<Desc>Don't attempt to extract any metadata from files.</Desc>
<Type>flag</Type>
</Option>
<Option>
<Name>metadata_fields</Name>
<Desc>Comma separated list of metadata fields to attempt to extract. Use 'tag&lt;tagname&gt;' to have the contents of the first &lt;tagname&gt; pair put in a metadata element called 'tagname'. Capitalise this as you want the metadata capitalised in Greenstone, since the tag extraction is case insensitive.</Desc>
<Type>string</Type>
<Default>Title</Default>
</Option>
<Option>
<Name>hunt_creator_metadata</Name>
<Desc>Find as much metadata as possible on authorship and place it in the 'Creator' field.</Desc>
<Type>flag</Type>
</Option>
<Option>
<Name>file_is_url</Name>
<Desc>Set if input filenames make up url of original source documents e.g. if a web mirroring tool was used to create the import directory structure.</Desc>
<Type>flag</Type>
</Option>
<Option>
<Name>assoc_files</Name>
<Desc>Perl regular expression of file extensions to associate with html documents.</Desc>
<Type>regexp</Type>
<Default>(?i)\.(gif|jpe?g|jpe|jpg|png|css)$</Default>
</Option>
<Option>
<Name>rename_assoc_files</Name>
<Desc>Renames files associated with documents (e.g. images). Also creates much shallower directory structure (useful when creating collections to go on cd-rom).</Desc>
<Type>flag</Type>
</Option>
<Option>
<Name>title_sub</Name>
<Desc>Substitution expression to modify string stored as Title. Used by, for example, PDFPlug to remove "Page 1", etc from text used as the title.</Desc>
<Type>string</Type>
<Default></Default>
</Option>
<Option>
<Name>description_tags</Name>
<Desc>Split document into sub-sections where &lt;Section&gt; tags occur. '-keep_head' will have no effect when this option is set.</Desc>
<Type>flag</Type>
</Option>
<Option>
<Name>w3mir</Name>
<Desc></Desc>
<Type>flag</Type>
<HiddenGLI>yes</HiddenGLI>
</Option>
<Option>
<Name>no_strip_metadata_html</Name>
<Desc>Comma separated list of metadata names, or 'all'. Used with -description_tags, it prevents stripping of HTML tags from the values for the specified metadata.</Desc>
<Type>string</Type>
<Required>no</Required>
<Default></Default>
</Option>
<Option>
<Name>sectionalise_using_h_tags</Name>
<Desc>Automatically create a sectioned document using h1, h2, ... hX tags.</Desc>
<Type>flag</Type>
</Option>
<Option>
<Name>tidy_html</Name>
<Desc>If set, converts an HTML document to well-formed XHTML. This enables users to view the document in the book format.</Desc>
<Type>flag</Type>
</Option>
<Option>
<Name>old_style_HDL</Name>
<Desc>To mark whether the file in this collection is sectionalized using the old HDL's section style.</Desc>
<Type>flag</Type>
</Option>
</Arguments>
<PlugInfo>
<Name>BasPlug</Name>
<Desc>Base class for all the import plugins.</Desc>
<Abstract>yes</Abstract>
<Inherits>no</Inherits>
<Processes></Processes>
<Blocks></Blocks>
<Explodes>no</Explodes>
<Arguments>
<Option>
<Name>process_exp</Name>
<Desc>A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i)\.html?$' matches all documents ending in .htm or .html (case-insensitive).</Desc>
<Type>regexp</Type>
<Required>no</Required>
<Default></Default>
</Option>
<Option>
<Name>block_exp</Name>
<Desc>Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.</Desc>
<Type>regexp</Type>
<Required>no</Required>
<Default></Default>
</Option>
<Option>
<Name>smart_block</Name>
<Desc>Block files in a smarter way than just looking at filenames.</Desc>
<Type>flag</Type>
<Required>no</Required>
</Option>
<Option>
<Name>associate_ext</Name>
<Desc>Causes files with the same root filename as the document being processed by the plugin AND a filename extension from the comma separated list provided by this argument to be associated with the document being processed rather than handled as a separate list.</Desc>
<Type>string</Type>
<Required>no</Required>
</Option>
<Option>
<Name>associate_tail_re</Name>
<Desc>{BasPlug.associate_tail_re}</Desc>
<Type>string</Type>
<Required>no</Required>
</Option>
<Option>
<Name>input_encoding</Name>
<Desc>The encoding of the source documents. Documents will be converted from these encodings and stored internally as utf8.</Desc>
<Type>enum</Type>
<Required>no</Required>
<List>
<Value>
<Name>auto</Name>
<Desc>Use text categorization algorithm to automatically identify the encoding of each source document. This will be slower than explicitly setting the encoding but will work where more than one encoding is used within the same collection.</Desc>
</Value>
<Value>
<Name>ascii</Name>
<Desc>Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead.</Desc>
</Value>
<Value>
<Name>utf8</Name>
<Desc>Either utf8 or unicode -- automatically detected.</Desc>
</Value>
<Value>
<Name>unicode</Name>
<Desc>Just unicode.</Desc>
</Value>
<Value>
<Name>iso_8859_6</Name>
<Desc>Arabic</Desc>
</Value>
<Value>
<Name>gb</Name>
<Desc>Chinese Simplified (GB)</Desc>
</Value>
<Value>
<Name>big5</Name>
<Desc>Chinese Traditional (Big5)</Desc>
</Value>
<Value>
<Name>koi8_r</Name>
<Desc>Cyrillic</Desc>
</Value>
<Value>
<Name>iso_8859_5</Name>
<Desc>Cyrillic</Desc>
</Value>
<Value>
<Name>koi8_u</Name>
<Desc>Cyrillic (Ukrainian)</Desc>
</Value>
<Value>
<Name>dos_437</Name>
<Desc>DOS codepage 437 (US English)</Desc>
</Value>
<Value>
<Name>dos_850</Name>
<Desc>DOS codepage 850 (Latin 1)</Desc>
</Value>
<Value>
<Name>dos_852</Name>
<Desc>DOS codepage 852 (Central European)</Desc>
</Value>
<Value>
<Name>dos_866</Name>
<Desc>DOS codepage 866 (Cyrillic)</Desc>
</Value>
<Value>
<Name>iso_8859_7</Name>
<Desc>Greek</Desc>
</Value>
<Value>
<Name>iso_8859_8</Name>
<Desc>Hebrew</Desc>
</Value>
<Value>
<Name>iscii_de</Name>
<Desc>ISCII Devanagari</Desc>
</Value>
<Value>
<Name>euc_jp</Name>
<Desc>Japanese (EUC)</Desc>
</Value>
<Value>
<Name>shift_jis</Name>
<Desc>Japanese (Shift-JIS)</Desc>
</Value>
<Value>
<Name>korean</Name>
<Desc>Korean (Unified Hangul Code - i.e. a superset of EUC-KR)</Desc>
</Value>
<Value>
<Name>iso_8859_1</Name>
<Desc>Latin1 (western languages)</Desc>
</Value>
<Value>
<Name>iso_8859_15</Name>
<Desc>Latin15 (revised western)</Desc>
</Value>
<Value>
<Name>iso_8859_2</Name>
<Desc>Latin2 (central and eastern european languages)</Desc>
</Value>
<Value>
<Name>iso_8859_3</Name>
<Desc>Latin3</Desc>
</Value>
<Value>
<Name>iso_8859_4</Name>
<Desc>Latin4</Desc>
</Value>
<Value>
<Name>iso_8859_9</Name>
<Desc>Turkish</Desc>
</Value>
<Value>
<Name>windows_1250</Name>
<Desc>Windows codepage 1250 (WinLatin2)</Desc>
</Value>
<Value>
<Name>windows_1251</Name>
<Desc>Windows codepage 1251 (WinCyrillic)</Desc>
</Value>
<Value>
<Name>windows_1252</Name>
<Desc>Windows codepage 1252 (WinLatin1)</Desc>
</Value>
<Value>
<Name>windows_1253</Name>
<Desc>Windows codepage 1253 (WinGreek)</Desc>
</Value>
<Value>
<Name>windows_1254</Name>
<Desc>Windows codepage 1254 (WinTurkish)</Desc>
</Value>
<Value>
<Name>windows_1255</Name>
<Desc>Windows codepage 1255 (WinHebrew)</Desc>
</Value>
<Value>
<Name>windows_1256</Name>
<Desc>Windows codepage 1256 (WinArabic)</Desc>
</Value>
<Value>
<Name>windows_1257</Name>
<Desc>Windows codepage 1257 (WinBaltic)</Desc>
</Value>
<Value>
<Name>windows_1258</Name>
<Desc>Windows codepage 1258 (Vietnamese)</Desc>
</Value>
<Value>
<Name>windows_874</Name>
<Desc>Windows codepage 874 (Thai)</Desc>
</Value>
</List>
<Default>auto</Default>
</Option>
<Option>
<Name>default_encoding</Name>
<Desc>Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone. This option can take the same values as -input_encoding.</Desc>
<Type>enum</Type>
<Required>no</Required>
<List>
<Value>
<Name>ascii</Name>
<Desc>Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead.</Desc>
</Value>
<Value>
<Name>utf8</Name>
<Desc>Either utf8 or unicode -- automatically detected.</Desc>
</Value>
<Value>
<Name>unicode</Name>
<Desc>Just unicode.</Desc>
</Value>
<Value>
<Name>iso_8859_6</Name>
<Desc>Arabic</Desc>
</Value>
<Value>
<Name>gb</Name>
<Desc>Chinese Simplified (GB)</Desc>
</Value>
<Value>
<Name>big5</Name>
<Desc>Chinese Traditional (Big5)</Desc>
</Value>
<Value>
<Name>koi8_r</Name>
<Desc>Cyrillic</Desc>
</Value>
<Value>
<Name>iso_8859_5</Name>
<Desc>Cyrillic</Desc>
</Value>
<Value>
<Name>koi8_u</Name>
<Desc>Cyrillic (Ukrainian)</Desc>
</Value>
<Value>
<Name>dos_437</Name>
<Desc>DOS codepage 437 (US English)</Desc>
</Value>
<Value>
<Name>dos_850</Name>
<Desc>DOS codepage 850 (Latin 1)</Desc>
</Value>
<Value>
<Name>dos_852</Name>
<Desc>DOS codepage 852 (Central European)</Desc>
</Value>
<Value>
<Name>dos_866</Name>
<Desc>DOS codepage 866 (Cyrillic)</Desc>
</Value>
<Value>
<Name>iso_8859_7</Name>
<Desc>Greek</Desc>
</Value>
<Value>
<Name>iso_8859_8</Name>
<Desc>Hebrew</Desc>
</Value>
<Value>
<Name>iscii_de</Name>
<Desc>ISCII Devanagari</Desc>
</Value>
<Value>
<Name>euc_jp</Name>
<Desc>Japanese (EUC)</Desc>
</Value>
<Value>
<Name>shift_jis</Name>
<Desc>Japanese (Shift-JIS)</Desc>
</Value>
<Value>
<Name>korean</Name>
<Desc>Korean (Unified Hangul Code - i.e. a superset of EUC-KR)</Desc>
</Value>
<Value>
<Name>iso_8859_1</Name>
<Desc>Latin1 (western languages)</Desc>
</Value>
<Value>
<Name>iso_8859_15</Name>
<Desc>Latin15 (revised western)</Desc>
</Value>
<Value>
<Name>iso_8859_2</Name>
<Desc>Latin2 (central and eastern european languages)</Desc>
</Value>
<Value>
<Name>iso_8859_3</Name>
<Desc>Latin3</Desc>
</Value>
<Value>
<Name>iso_8859_4</Name>
<Desc>Latin4</Desc>
</Value>
<Value>
<Name>iso_8859_9</Name>
<Desc>Turkish</Desc>
</Value>
<Value>
<Name>windows_1250</Name>
<Desc>Windows codepage 1250 (WinLatin2)</Desc>
</Value>
<Value>
<Name>windows_1251</Name>
<Desc>Windows codepage 1251 (WinCyrillic)</Desc>
</Value>
<Value>
<Name>windows_1252</Name>
<Desc>Windows codepage 1252 (WinLatin1)</Desc>
</Value>
<Value>
<Name>windows_1253</Name>
<Desc>Windows codepage 1253 (WinGreek)</Desc>
</Value>
<Value>
<Name>windows_1254</Name>
<Desc>Windows codepage 1254 (WinTurkish)</Desc>
</Value>
<Value>
<Name>windows_1255</Name>
<Desc>Windows codepage 1255 (WinHebrew)</Desc>
</Value>
<Value>
<Name>windows_1256</Name>
<Desc>Windows codepage 1256 (WinArabic)</Desc>
</Value>
<Value>
<Name>windows_1257</Name>
<Desc>Windows codepage 1257 (WinBaltic)</Desc>
</Value>
<Value>
<Name>windows_1258</Name>
<Desc>Windows codepage 1258 (Vietnamese)</Desc>
</Value>
<Value>
<Name>windows_874</Name>
<Desc>Windows codepage 874 (Thai)</Desc>
</Value>
</List>
<Default>utf8</Default>
</Option>
<Option>
<Name>extract_language</Name>
<Desc>Identify the language of each document and set 'Language' metadata. Note that this will be done automatically if -input_encoding is 'auto'.</Desc>
<Type>flag</Type>
<Required>no</Required>
</Option>
<Option>
<Name>default_language</Name>
<Desc>If Greenstone fails to work out what language a document is the 'Language' metadata element will be set to this value. The default is 'en' (ISO 639 language symbols are used: en = English). Note that if -input_encoding is not set to 'auto' and -extract_language is not set, all documents will have their 'Language' metadata set to this value.</Desc>
<Type>string</Type>
<Required>no</Required>
<Default>en</Default>
</Option>
<Option>
<Name>extract_acronyms</Name>
<Desc>Extract acronyms from within text and set as metadata.</Desc>
<Type>flag</Type>
<Required>no</Required>
</Option>
<Option>
<Name>markup_acronyms</Name>
<Desc>Add acronym metadata into document text.</Desc>
<Type>flag</Type>
<Required>no</Required>
</Option>
<Option>
<Name>extract_keyphrases</Name>
<Desc>Extract keyphrases automatically with Kea (default settings).</Desc>
<Type>flag</Type>
<Required>no</Required>
</Option>
<Option>
<Name>extract_keyphrases_kea4</Name>
<Desc>Extract keyphrases automatically with Kea 4.0 (default settings). Kea 4.0 is a new version of Kea that has been developed for controlled indexing of documents in the domain of agriculture.</Desc>
<Type>flag</Type>
<Required>no</Required>
</Option>
<Option>
<Name>extract_keyphrase_options</Name>
<Desc>Options for keyphrase extraction with Kea. For example: mALIWEB - use ALIWEB extraction model; n5 - extract 5 keyphrases; eGBK - use GBK encoding.</Desc>
<Type>string</Type>
<Required>no</Required>
<Default></Default>
</Option>
<Option>
<Name>first</Name>
<Desc>Comma separated list of first sizes to extract from the text into a metadata field. The field is called 'FirstNNN'.</Desc>
<Type>string</Type>
<Required>no</Required>
</Option>
<Option>
<Name>extract_email</Name>
<Desc>Extract email addresses as metadata.</Desc>
<Type>flag</Type>
<Required>no</Required>
</Option>
<Option>
<Name>extract_historical_years</Name>
<Desc>Extract time-period information from historical documents. This is stored as metadata with the document. There is a search interface for this metadata, which you can include in your collection by adding the statement, "format QueryInterface DateSearch" to your collection configuration file.</Desc>
<Type>flag</Type>
<Required>no</Required>
</Option>
<Option>
<Name>maximum_year</Name>
<Desc>The maximum historical date to be used as metadata (in a Common Era date, such as 1950).</Desc>
<Type>int</Type>
<Required>no</Required>
<CharactorLength>4</CharactorLength>
<Default>2008</Default>
</Option>
<Option>
<Name>maximum_century</Name>
<Desc>The maximum named century to be extracted as historical metadata (e.g. 14 will extract all references up to the 14th century).</Desc>
<Type>string</Type>
<Required>no</Required>
<Default>-1</Default>
</Option>
<Option>
<Name>no_bibliography</Name>
<Desc>Do not try to block bibliographic dates when extracting historical dates.</Desc>
<Type>flag</Type>
<Required>no</Required>
</Option>
<Option>
<Name>no_cover_image</Name>
<Desc>Do not look for a prefix.jpg file (where prefix is the same prefix as the file being processed) and associate it as a cover image.</Desc>
<Type>flag</Type>
<Required>no</Required>
</Option>
<Option>
<Name>separate_cjk</Name>
<Desc>Insert spaces between Chinese/Japanese/Korean characters to make each character a word. Use if text is not segmented.</Desc>
<Type>flag</Type>
<Required>no</Required>
<HiddenGLI>yes</HiddenGLI>
</Option>
<Option>
<Name>new_extract_email</Name>
<Desc></Desc>
<Type>flag</Type>
<Required>no</Required>
<HiddenGLI>yes</HiddenGLI>
</Option>
</Arguments>
</PlugInfo>
</PlugInfo>
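
(For reference, the options listed above are normally set on the plugin line of a collection's etc/collect.cfg file, or through the "Document Plugins" panel in GLI. A minimal sketch, using purely illustrative option values, might look like:

    plugin HTMLPlug -description_tags -metadata_fields Title,Author -input_encoding auto

Flag options such as -nolinks or -tidy_html are appended to the same line with no value; options that take a value, such as -metadata_fields, are followed by that value.)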