Module:Citation/CS1/COinS: Difference between revisions
Content added Content deleted
Vivaporius (talk | contribs) m (1 revision imported) |
en>Trappist the monk (sync from sandbox;) |
||
Line 1: | Line 1: | ||
local coins = {}; |
|||
--[[--------------------------< F O R W A R D D E C L A R A T I O N S >-------------------------------------- |
--[[--------------------------< F O R W A R D D E C L A R A T I O N S >-------------------------------------- |
||
]] |
]] |
||
local is_set, in_array, remove_wiki_link; |
local has_accept_as_written, is_set, in_array, remove_wiki_link, strip_apostrophe_markup; -- functions in Module:Citation/CS1/Utilities |
||
local cfg; -- table of configuration tables that are defined in Module:Citation/CS1/Configuration |
local cfg; -- table of configuration tables that are defined in Module:Citation/CS1/Configuration |
||
⚫ | |||
Strip wiki italic and bold markup from argument so that it doesn't contaminate COinS metadata. |
|||
This function strips common patterns of apostrophe markup. We presume that editors who have taken the time to |
|||
markup a title have, as a result, provided valid markup. When they don't, some single apostrophes are left behind. |
|||
⚫ | |||
local function strip_apostrophe_markup (argument) |
|||
if not is_set (argument) then return argument; end |
|||
if argument:find ( "''", 1, true ) == nil then -- Is there at least one double apostrophe? If not, exit. |
|||
return argument; |
|||
end |
|||
while true do |
|||
if argument:find ( "'''''", 1, true ) then -- bold italic (5) |
|||
argument=argument:gsub("%'%'%'%'%'", ""); -- remove all instances of it |
|||
elseif argument:find ( "''''", 1, true ) then -- italic start and end without content (4) |
|||
argument=argument:gsub("%'%'%'%'", ""); |
|||
elseif argument:find ( "'''", 1, true ) then -- bold (3) |
|||
argument=argument:gsub("%'%'%'", ""); |
|||
elseif argument:find ( "''", 1, true ) then -- italic (2) |
|||
argument=argument:gsub("%'%'", ""); |
|||
else |
|||
break; |
|||
end |
|||
end |
|||
return argument; -- done |
|||
end |
|||
Line 52: | Line 17: | ||
local function make_coins_title (title, script) |
local function make_coins_title (title, script) |
||
title = has_accept_as_written (title); |
|||
if is_set (title) then |
if is_set (title) then |
||
title = strip_apostrophe_markup (title); -- strip any apostrophe markup |
title = strip_apostrophe_markup (title); -- strip any apostrophe markup |
||
else |
else |
||
title=''; -- if not set, make sure title is an empty string |
title = ''; -- if not set, make sure title is an empty string |
||
end |
end |
||
if is_set (script) then |
if is_set (script) then |
||
Line 61: | Line 27: | ||
script = strip_apostrophe_markup (script); -- strip any apostrophe markup |
script = strip_apostrophe_markup (script); -- strip any apostrophe markup |
||
else |
else |
||
script=''; |
script = ''; -- if not set, make sure script is an empty string |
||
end |
end |
||
if is_set (title) and is_set (script) then |
if is_set (title) and is_set (script) then |
||
Line 72: | Line 38: | ||
--[[--------------------------< E S C A P E _ L U A _ M A G I C _ C H A R S >---------------------------------- |
--[[--------------------------< E S C A P E _ L U A _ M A G I C _ C H A R S >---------------------------------- |
||
Returns a string where all of |
Returns a string where all of Lua's magic characters have been escaped. This is important because functions like |
||
string.gsub() treat their pattern and replace strings as patterns, not literal strings. |
string.gsub() treat their pattern and replace strings as patterns, not literal strings. |
||
]] |
]] |
||
Line 78: | Line 44: | ||
local function escape_lua_magic_chars (argument) |
local function escape_lua_magic_chars (argument) |
||
argument = argument:gsub("%%", "%%%%"); -- replace % with %% |
argument = argument:gsub("%%", "%%%%"); -- replace % with %% |
||
argument = argument:gsub("([%^%$%(%)%.%[%]%*%+%-%?])", "%%%1"); -- replace all other |
argument = argument:gsub("([%^%$%(%)%.%[%]%*%+%-%?])", "%%%1"); -- replace all other Lua magic pattern characters |
||
return argument; |
return argument; |
||
end |
end |
||
Line 94: | Line 60: | ||
while true do |
while true do |
||
pattern = pages:match("%[(%w*:?//[^ ]+%s+)[%w%d].*%]"); -- pattern is the opening bracket, the |
pattern = pages:match("%[(%w*:?//[^ ]+%s+)[%w%d].*%]"); -- pattern is the opening bracket, the URL and following space(s): "[url " |
||
if nil == pattern then break; end -- no more |
if nil == pattern then break; end -- no more URLs |
||
pattern = escape_lua_magic_chars (pattern); -- pattern is not a literal string; escape |
pattern = escape_lua_magic_chars (pattern); -- pattern is not a literal string; escape Lua's magic pattern characters |
||
pages = pages:gsub(pattern, ""); -- remove as many instances of pattern as possible |
pages = pages:gsub(pattern, ""); -- remove as many instances of pattern as possible |
||
end |
end |
||
pages = pages:gsub("[%[%]]", ""); -- remove the brackets |
pages = pages:gsub("[%[%]]", ""); -- remove the brackets |
||
pages = pages:gsub("–", "-" ); -- replace endashes with hyphens |
pages = pages:gsub("–", "-" ); -- replace endashes with hyphens |
||
pages = pages:gsub("&%w+;", "-" ); -- and replace |
pages = pages:gsub("&%w+;", "-" ); -- and replace HTML entities (&ndash; etc.) with hyphens; do we need to replace numerical entities like &#32; and the like? |
||
return pages; |
return pages; |
||
end |
end |
||
Line 114: | Line 80: | ||
MathML with SVG or PNG fallback |
MathML with SVG or PNG fallback |
||
All three are heavy with |
All three are heavy with HTML and CSS which doesn't belong in the metadata. |
||
Without this function, the metadata saved in the raw wikitext contained the rendering determined by the settings |
Without this function, the metadata saved in the raw wikitext contained the rendering determined by the settings |
||
Line 121: | Line 87: | ||
This function gets the rendered form of an equation according to the editor's preference before the page is saved. It |
This function gets the rendered form of an equation according to the editor's preference before the page is saved. It |
||
then searches the rendering for the text equivalent of the rendered equation and replaces the rendering with that so |
then searches the rendering for the text equivalent of the rendered equation and replaces the rendering with that so |
||
that the page is saved without extraneous |
that the page is saved without extraneous HTML/CSS markup and with a reasonably readable text form of the equation. |
||
When a replacement is made, this function returns true and the value with replacement; otherwise false and the |
When a replacement is made, this function returns true and the value with replacement; otherwise false and the initial |
||
value. To replace multiple equations it is |
value. To replace multiple equations it is necessary to call this function from within a loop. |
||
]=] |
]=] |
||
Line 154: | Line 120: | ||
--[[--------------------------< C O I N S _ C L E A N U P >---------------------------------------------------- |
--[[--------------------------< C O I N S _ C L E A N U P >---------------------------------------------------- |
||
Clean up parameter values for the metadata by removing or replacing invisible characters and certain |
Clean up parameter values for the metadata by removing or replacing invisible characters and certain HTML entities. |
||
2015-12-10: there is a bug in mw.text.unstripNoWiki (). It replaces math stripmarkers with the appropriate content |
2015-12-10: there is a bug in mw.text.unstripNoWiki (). It replaces math stripmarkers with the appropriate content |
||
Line 171: | Line 137: | ||
end |
end |
||
value = value:gsub (cfg.stripmarkers['math'], "MATH RENDER ERROR"); |
value = value:gsub (cfg.stripmarkers['math'], "MATH RENDER ERROR"); -- one or more couldn't be replaced; insert vague error message |
||
value = mw.text.unstripNoWiki (value); -- replace nowiki stripmarkers with their content |
value = mw.text.unstripNoWiki (value); -- replace nowiki stripmarkers with their content |
||
Line 177: | Line 143: | ||
value = value:gsub (' ', ' '); -- replace entity with plain space |
value = value:gsub (' ', ' '); -- replace entity with plain space |
||
value = value:gsub ('\226\128\138', ' '); -- replace hair space with plain space |
value = value:gsub ('\226\128\138', ' '); -- replace hair space with plain space |
||
if not mw.ustring.find (value, cfg.indic_script) then -- don't remove zero |
if not mw.ustring.find (value, cfg.indic_script) then -- don't remove zero-width joiner characters from indic script |
||
value = value:gsub ('‍', ''); |
value = value:gsub ('‍', ''); -- remove ‍ entities |
||
value = mw.ustring.gsub (value, '[\226\128\141\226\128\139\194\173]', ''); -- remove zero-width joiner, zero-width space, soft hyphen |
value = mw.ustring.gsub (value, '[\226\128\141\226\128\139\194\173]', ''); -- remove zero-width joiner, zero-width space, soft hyphen |
||
end |
end |
||
value = value:gsub ('[\009\010\013]', ' '); |
value = value:gsub ('[\009\010\013 ]+', ' '); -- replace horizontal tab, line feed, carriage return with plain space |
||
return value; |
return value; |
||
end |
end |
||
Line 214: | Line 180: | ||
}); |
}); |
||
if in_array (class, {'arxiv', 'biorxiv', 'citeseerx', ' |
if in_array (class, {'arxiv', 'biorxiv', 'citeseerx', 'ssrn', 'journal', 'news', 'magazine'}) or |
||
(in_array (class, {'conference', 'interview', 'map', 'press release', 'web'}) and is_set(data.Periodical)) or |
|||
('citation' == class and is_set(data.Periodical) and not is_set (data.Encyclopedia)) then |
('citation' == class and is_set(data.Periodical) and not is_set (data.Encyclopedia)) then |
||
OCinSoutput.rft_val_fmt = "info:ofi/fmt:kev:mtx:journal"; -- journal metadata identifier |
OCinSoutput.rft_val_fmt = "info:ofi/fmt:kev:mtx:journal"; -- journal metadata identifier |
||
if in_array (class, {'arxiv', 'biorxiv', 'citeseerx'}) then |
if in_array (class, {'arxiv', 'biorxiv', 'citeseerx', 'ssrn'}) then -- set genre according to the type of citation template we are rendering |
||
OCinSoutput["rft.genre"] = "preprint"; -- cite arxiv, cite biorxiv, cite citeseerx |
OCinSoutput["rft.genre"] = "preprint"; -- cite arxiv, cite biorxiv, cite citeseerx, cite ssrn |
||
elseif 'conference' == class then |
elseif 'conference' == class then |
||
OCinSoutput["rft.genre"] = "conference"; -- cite conference (when Periodical set) |
OCinSoutput["rft.genre"] = "conference"; -- cite conference (when Periodical set) |
||
Line 231: | Line 198: | ||
-- these used only for periodicals |
-- these used only for periodicals |
||
OCinSoutput["rft.ssn"] = data.Season; -- keywords: winter, spring, summer, fall |
OCinSoutput["rft.ssn"] = data.Season; -- keywords: winter, spring, summer, fall |
||
OCinSoutput["rft.quarter"] = data.Quarter; -- single digits 1->first quarter, etc. |
|||
OCinSoutput["rft.chron"] = data.Chron; -- free-form date components |
OCinSoutput["rft.chron"] = data.Chron; -- free-form date components |
||
OCinSoutput["rft.volume"] = data.Volume; -- does not apply to books |
OCinSoutput["rft.volume"] = data.Volume; -- does not apply to books |
||
OCinSoutput["rft.issue"] = data.Issue; |
OCinSoutput["rft.issue"] = data.Issue; |
||
OCinSoutput['rft.artnum'] = data.ArticleNumber; -- {{cite journal}} only |
|||
OCinSoutput["rft.pages"] = data.Pages; -- also used in book metadata |
OCinSoutput["rft.pages"] = data.Pages; -- also used in book metadata |
||
Line 254: | Line 223: | ||
end |
end |
||
end |
end |
||
else --{'audio-visual', 'AV-media-notes', 'DVD-notes', 'episode', 'interview', 'mailinglist', 'map', 'newsgroup', 'podcast', 'press release', 'serial', 'sign', 'speech', 'web'} |
else -- {'audio-visual', 'AV-media-notes', 'DVD-notes', 'episode', 'interview', 'mailinglist', 'map', 'newsgroup', 'podcast', 'press release', 'serial', 'sign', 'speech', 'web'} |
||
OCinSoutput["rft.genre"] = "unknown"; |
OCinSoutput["rft.genre"] = "unknown"; |
||
end |
end |
||
Line 270: | Line 239: | ||
OCinSoutput['rft.inst'] = data.PublisherName; -- book and dissertation |
OCinSoutput['rft.inst'] = data.PublisherName; -- book and dissertation |
||
end |
end |
||
-- NB. Not currently supported are "info:ofi/fmt:kev:mtx:patent", "info:ofi/fmt:kev:mtx:dc", "info:ofi/fmt:kev:mtx:sch_svc", "info:ofi/fmt:kev:mtx:ctx" |
|||
-- and now common parameters (as much as possible) |
-- and now common parameters (as much as possible) |
||
OCinSoutput["rft.date"] = data.Date; -- book, journal, dissertation |
OCinSoutput["rft.date"] = data.Date; -- book, journal, dissertation |
||
for k, v in pairs( data.ID_list ) do -- what to do about these? For now assume that they are common to all? |
for k, v in pairs( data.ID_list ) do -- what to do about these? For now assume that they are common to all? |
||
-- if k == 'ISBN' then v = clean_isbn( v ) end |
|||
if k == 'ISBN' then v = v:gsub( "[^-0-9X]", "" ); end |
if k == 'ISBN' then v = v:gsub( "[^-0-9X]", "" ); end |
||
local id = cfg.id_handlers[k].COinS; |
local id = cfg.id_handlers[k].COinS; |
||
if string.sub( id or "", 1, 4 ) == 'info' then -- for ids that are in the info:registry |
if string.sub( id or "", 1, 4 ) == 'info' then -- for ids that are in the info:registry |
||
OCinSoutput["rft_id"] = table.concat{ id, "/", v }; |
OCinSoutput["rft_id"] = table.concat{ id, "/", v }; |
||
elseif string.sub (id or "", 1, 3 ) == 'rft' then -- for isbn, issn, eissn, etc that have defined COinS keywords |
elseif string.sub (id or "", 1, 3 ) == 'rft' then -- for isbn, issn, eissn, etc. that have defined COinS keywords |
||
OCinSoutput[ id ] = v; |
OCinSoutput[ id ] = v; |
||
elseif id then |
elseif 'url' == id then -- for urls that are assembled in ~/Identifiers; |asin= and |ol= |
||
OCinSoutput["rft_id"] = table.concat{ cfg.id_handlers[k]. |
OCinSoutput["rft_id"] = table.concat ({data.ID_list[k], "#id-name=", cfg.id_handlers[k].label}); |
||
elseif id then -- when cfg.id_handlers[k].COinS is not nil so urls created here |
|||
OCinSoutput["rft_id"] = table.concat{ cfg.id_handlers[k].prefix, v, cfg.id_handlers[k].suffix or '', "#id-name=", cfg.id_handlers[k].label }; -- others; provide a URL and indicate identifier name as #fragment (human-readable, but transparent to browsers) |
|||
end |
end |
||
end |
end |
||
--[[ |
|||
for k, v in pairs( data.ID_list ) do -- what to do about these? For now assume that they are common to all? |
|||
local id, value = cfg.id_handlers[k].COinS; |
|||
if k == 'ISBN' then value = clean_isbn( v ); else value = v; end |
|||
if string.sub( id or "", 1, 4 ) == 'info' then |
|||
OCinSoutput["rft_id"] = table.concat{ id, "/", v }; |
|||
else |
|||
OCinSoutput[ id ] = value; |
|||
end |
|||
end |
|||
]] |
|||
local last, first; |
local last, first; |
||
for k, v in ipairs( data.Authors ) do |
for k, v in ipairs( data.Authors ) do |
||
last, first = coins_cleanup (v.last), coins_cleanup (v.first or ''); -- replace any nowiki |
last, first = coins_cleanup (v.last), coins_cleanup (v.first or ''); -- replace any nowiki stripmarkers, non-printing or invisible characters |
||
if k == 1 then -- for the first author name only |
if k == 1 then -- for the first author name only |
||
if is_set(last) |
if is_set(last) and is_set(first) then -- set these COinS values if |first= and |last= specify the first author name |
||
OCinSoutput["rft.aulast"] = last; -- book, journal, dissertation |
OCinSoutput["rft.aulast"] = last; -- book, journal, dissertation |
||
OCinSoutput["rft.aufirst"] = first; -- book, journal, dissertation |
OCinSoutput["rft.aufirst"] = first; -- book, journal, dissertation |
||
Line 313: | Line 273: | ||
OCinSoutput["rft.au"] = last; -- book, journal, dissertation |
OCinSoutput["rft.au"] = last; -- book, journal, dissertation |
||
end |
end |
||
-- TODO: At present we do not report "et al.". Add anything special if this condition applies? |
|||
end |
end |
||
end |
end |
||
Line 318: | Line 279: | ||
OCinSoutput.rft_id = data.URL; |
OCinSoutput.rft_id = data.URL; |
||
OCinSoutput.rfr_id = table.concat{ "info:sid/", mw.site.server:match( "[^/]*$" ), ":", data.RawPage }; |
OCinSoutput.rfr_id = table.concat{ "info:sid/", mw.site.server:match( "[^/]*$" ), ":", data.RawPage }; |
||
⚫ | |||
-- TODO: Add optional extra info: |
|||
-- rfr_dat=#REVISION<version> (referrer private data) |
|||
-- ctx_id=<data.RawPage>#<ref> (identifier for the context object) |
|||
-- ctx_tim=<ts> (timestamp in format yyyy-mm-ddThh:mm:ssTZD or yyyy-mm-dd) |
|||
-- ctx_enc=info:ofi/enc:UTF-8 (character encoding) |
|||
⚫ | |||
-- sort with version string always first, and combine. |
-- sort with version string always first, and combine. |
||
--table.sort( OCinSoutput ); |
-- table.sort( OCinSoutput ); |
||
table.insert( OCinSoutput, 1, "ctx_ver=" .. ctx_ver ); |
table.insert( OCinSoutput, 1, "ctx_ver=" .. ctx_ver ); -- such as "Z39.88-2004" |
||
return table.concat(OCinSoutput, "&"); |
return table.concat(OCinSoutput, "&"); |
||
end |
end |
||
Line 336: | Line 304: | ||
cfg = cfg_table_ptr; |
cfg = cfg_table_ptr; |
||
has_accept_as_written = utilities_page_ptr.has_accept_as_written; -- import functions from selected Module:Citation/CS1/Utilities module |
|||
is_set = utilities_page_ptr.is_set; |
|||
in_array = utilities_page_ptr.in_array; |
in_array = utilities_page_ptr.in_array; |
||
remove_wiki_link = utilities_page_ptr.remove_wiki_link; |
remove_wiki_link = utilities_page_ptr.remove_wiki_link; |
||
strip_apostrophe_markup = utilities_page_ptr.strip_apostrophe_markup; |
|||
end |
end |
||
⚫ | |||
⚫ | |||
return { |
return { |