#!/usr/bin/env perl
=head1 NAME
notex.cgi - Network Oriented Transforms in ECMAScript and XML (NOTEX)
Transform XML docs with the SpiderMonkey JavaScript engine
=head1 VERSION
This document refers to version 2.1.1 of notex.cgi, released Feb 24, 2009
=head1 DESCRIPTION
notex.cgi transforms XML documents with JavaScript. It is run via CGI
by providing a number of query string parameters in the URL, including:
* "app": The name of the application running this CGI script
* "token": An app token (a filename in the app's directory)
* "script": The URL of the JavaScript file to read and run
* "jcache": How long (in secs) to cache the JavaScript file (opt)
* "cache": How long (in secs) to cache the CGI output (opt)
The "script" parameter may be omitted if you're using Apache actions to run
this CGI script automatically for JavaScript files with the extension ".jsx".
Here is an example Apache configuration file called ".htaccess":
----------------------------
# Run .jsx NOTEX JavaScripts
Options ExecCGI
AddHandler cgi-script .cgi
AddHandler jsx-notex .jsx
Action jsx-notex /notex.cgi
# Setup the NOTEX environment
Setenv NOTEX_STRICT 0
----------------------------
The JavaScript file may include a number of function calls that are exposed
by this CGI program. Here they are included in this example JavaScript code:
log('Get my bill details and cache them for 10 mins');
var bill = new XML(GET('mydomain.com/mybill.xml', 600)); // secs
var output = <receipt>
<buyer>{ bill.buyer.toString() }</buyer>
<value>{ bill.value.toString() }</value>
</receipt>;
write('<receipts>', output.toXMLString(), '</receipts>');
log('Have written my receipts');
This example demonstrates the use of "log()", "GET()" and "write()", plus
the E4X processing available in the SpiderMonkey JavaScript engine.
Please see: https://developer.mozilla.org/Special:Tags?tag=E4X&language=en
for details about processing XML using JavaScript in SpiderMonkey.
In addition, a function called "param()" is exposed by this CGI program so
that the JavaScript file can be parameterized via the URL query string, for
example to read particular XML files for processing or to change the
way an XML data file is processed. Here's an example:
var url = param('url'); // get the URL from the query parameter
var xml = new XML(GET(url)); // read the URL as XML data
xml += <app>{ config('app') }</app>;
write('<result>', xml.toXMLString(), '</result>');
All activity is logged in the application's log directory, and results may
be cached for a period by specifying a cache age in seconds in the query.
In addition files read with the "GET()" function may also be cached by
specifying an age as the second argument like this: "x = GET(url, 600);".
HTTP methods "HEAD()", "DELETE()", "POST()" and "PUT()" are also supported
(see below for details) but only the "GET()" method supports caching.
Each application has a directory in "apps", with 2 writable directories:
The log directory is called "logs" and data files are cached in "cache".
Be sure to "touch" a token file in each app's directory for security.
The filename of the token file must match the query string parameter.
=head2 Examples
Here are some example query URLs to this CGI script:
1 notex.cgi?app=demo&token=abc&script=mysite.com/mycode.js
2 notex.cgi?app=demo&token=abc&script=mysite.com/mycode.js&jcache=600
3 notex.cgi?app=demo&token=abc&script=mysite.com/mycode.js&cache=600
Version 1 will always read the script "mycode.js" and run it whenever the
URL is requested. Version 2 will cache the script "mycode.js" but run it
whenever the URL is requested. Version 3 will run the script "mycode.js"
one time, then cache the output for any future requests.
=cut
package NOTEX;
$VERSION = '2.1.1';
use strict;
use warnings;
# Names of the cookies consulted when the "app" / "token" query parameters are absent
use constant COOKIE_NOTEX_APP => 'notex_app';
use constant COOKIE_NOTEX_TOKEN => 'notex_token';
# Token file name used when a request supplies no token
use constant DEFAULT_TOKEN_NAME => 'token';
# Default value of the "xml_encoding" config setting
use constant DEFAULT_ENCODING => 'utf-8';
# Pattern a filename must match before the JavaScript "read()" function opens it
use constant SCRIPT_PATH_MATCH => '^[\w\-\.\\/]+\.jsx?$';
# Index of the modification time in the list returned by stat()
use constant STAT_MOD_TIME => 9;
use Time::HiRes qw(time);
use Digest::MD5 qw(md5_hex);
use LWP::UserAgent;
use File::Basename;
use CGI qw(:cgi -debug);
use JavaScript::SpiderMonkey;
# Declare globals
our %params; # query string (or PUT/POST) parameters
our $app; # the "app" parameter
our $token; # the "token" parameter
our $script; # the "script" parameter
our $remote_host;# the IP address making the CGI request
our %config; # various NOTEX configuration settings
our $ua; # our user agent when sending requests
our $query; # the query string in the requested URL
our $status; # the status of our most recent request
our @cookies; # a list of cookies to set in requests
our %headers; # a hash of headers to set in our response
our $bytes_in; # total bytes read into this CGI script
our $bytes_out; # total bytes written out of CGI script
our $start_time; # the time the CGI request was received
our $request_id; # the ID of each request to this script
=head2 Functions exposed to your ECMAScript (JavaScript) code
=over 4
=item GET(url, [age])
GET a file at a URL and return its contents
(specify an age in seconds to cache the file)
=cut
sub GET # exposed in JavaScript as "GET()"
{
    # Fetch the contents of a URL, optionally going through the app's cache.
    #   $url - the URL to read (relative URLs are resolved by _request)
    #   $age - optional cache lifetime in seconds; when true the cache is
    #          consulted first and refreshed after a successful fetch
    # Returns the content, or '' when there is no URL or the request failed.
    my ($url, $age) = @_;
    return '' unless $url;

    # Optionally, look for a file in the cache
    if ($age)
    {
        my $cached = load($url, $age);
        return $cached if $cached;
    }

    # Read the file over the network
    my $in = _request(GET => $url);

    # Only cache real content: _request returns '' on failure, and saving
    # that would replace previously cached content with an empty file.
    save($url, $in) if $age && defined $in && length $in;
    return $in;
}
=item HEAD(url) & DELETE(url)
Send a HEAD or DELETE request for a URL
=cut
# See function_set('HEAD'... below
# See function_set('DELETE'... below
=item POST(url, content, [content_type]) & PUT(url, content, [content_type])
POST or PUT some content to a URL and return the response contents
(the default content type is "application/x-www-form-urlencoded")
=cut
sub POST_or_PUT # exposed in JavaScript as "POST()" and "PUT()"
{
    # Send a request body to a URL with the given HTTP method and hand back
    # whatever _request returns for it.
    #   $verb   - the HTTP method to use ("POST" or "PUT")
    #   $target - the URL to send to
    #   $body   - the content to send
    #   $type   - optional content type (form-urlencoded when not given)
    my ($verb, $target, $body, $type) = @_;
    $type = 'application/x-www-form-urlencoded' unless $type;
    my @header = ('Content-Type' => $type);
    return _request($verb, $target, \@header, $body);
}
=item write(content)
Write some content as HTTP output using the "header()" and "config()" settings
=cut
sub write_out # exposed in JavaScript as "write()"
{
    # Print the HTTP response (headers plus content) and cache it when the
    # request asked for output caching.
    #   $out - the response body
    # The content type defaults to the "content_type" config setting. For
    # "text/xml" output that does not already start with an XML declaration,
    # one is prepended using the "xml_encoding" config setting.
    # NOTE(review): the declaration text was lost from this file (only a bare
    # "\n" prefix remained); the standard form below is a reconstruction.
    my ($out) = @_;
    $out = '' unless defined $out;

    $headers{-type} ||= $config{content_type};
    my $type = defined $headers{-type} ? $headers{-type} : '';
    if ($type eq 'text/xml' && $out !~ /^\s*<\?xml/)
    {
        my $encoding = $config{xml_encoding} || DEFAULT_ENCODING();
        $out = qq{<?xml version="1.0" encoding="$encoding"?>\n$out};
    }

    $headers{-cookie} = \@cookies if @cookies;
    $out = header(%headers) . $out;
    print $out;
    $bytes_out += length $out;

    # $query is only set when the request supplied a "cache" parameter
    save($query, $out) if $query;
}
=item read(filename)
Read a filename (providing it's a JavaScript file ending ".js" or ".jsx")
=cut
sub read_in # exposed in JavaScript as "read()"
{
    # Read a whole file and return its contents.
    #   $file  - the file to read
    #   $match - optional pattern; when given, $file must match it and is
    #            then resolved with _full_path (used for user-supplied names)
    # Returns '' when the name is rejected or the file cannot be opened.
    # NOTE(review): SCRIPT_PATH_MATCH accepts ".." path segments, so scripts
    # can read .js/.jsx files outside the script directory - confirm intended.
    my ($file, $match) = @_; # match user files
    return '' unless defined $file && length $file;

    if ($match)
    {
        return '' if $file !~ /$match/;
        $file = _full_path($file);
    }

    # Three-argument open: the file name can never be parsed as a mode
    open(my $fh, '<', $file) or return '';
    local $/; # slurp mode: read the whole file at once
    my $in = <$fh>;
    close $fh;
    return defined $in ? $in : '';
}
=item http(header) & https(header)
Return an HTTP(S) header, for example 'Accept-language' or 'User-agent'
=cut
# See function_set('http'... below
# See function_set('https'... below
=item method()
Return the HTTP method used to call this script (GET, HEAD, PUT, POST or DELETE)
=cut
# See function_set('method'... below
=item status()
Return the HTTP status of the most recent HTTP request made by this script
=cut
# See function_set('status'... below
=item param(name, [default])
Return a parameter from the URL query string, or the default value otherwise
=cut
# See function_set('param'... below
=item config(name, [value])

Return a config setting, and optionally set it to a new value:
* app: the application name running this CGI script
* token: the app token (a filename in the app's directory)
* script: the script being run (either a URL or a file path)
* remote_host: the remote host name or address running the script
* user_agent: by default the user agent is called "NOTEX/2.1.1"
* http_timeout: by default the HTTP request timeout is 10 seconds
* content_type: defaults to "text/xml" but can be any other type
* xml_encoding: defaults to "utf-8" for Unicode but can be other
* clean_up_xml: defaults to 1 to remove XML declarations and doctype tags
from XML content read over HTTP

=cut

# NOTE(review): the text between the "clean_up_xml" description above and the
# tail of cookies() below was lost from this file. config() and the first
# lines of cookies() are reconstructed from the surviving fragment and from
# the get-or-set pattern of headers(); confirm against the upstream source.
sub config # exposed in JavaScript as "config()"
{
    # Get a config setting, first storing a new value when one is provided.
    my ($name, $value) = @_;
    $config{$name} = $value if defined $value;
    return $config{$name};
}

=item cookie(name, [value], [expires], [path], [domain], [secure])

Get a cookie (if you only provide the cookie name, its value is returned)
or add a cookie to be set when the response is written

=cut

sub cookies # exposed in JavaScript as "cookie()"
{
    # With a name only: return that cookie as read by CGI's cookie().
    # With a value: build a cookie and queue it in @cookies, which
    # write_out() passes to the response headers.
    # NOTE(review): the order of the optional arguments is assumed to follow
    # the order in which they are passed to cookie() below - confirm.
    my ($name, $value, $expires, $path, $domain, $secure) = @_;
    return cookie(-name => $name) unless defined $value;
    push @cookies, cookie(-name => $name, -value => $value, -expires => $expires, -path => $path, -domain => $domain, -secure => $secure);
}
=item header(name, [value])
Get or set a header (if you only provide the header name, its value is returned)
=cut
sub headers # exposed in JavaScript as "header()"
{
    # Look up a response header, first storing a new value for it when the
    # caller supplied a defined one. Returns the header's current value.
    my ($key, $new_value) = @_;
    if (defined $new_value)
    {
        $headers{$key} = $new_value;
    }
    return $headers{$key};
}
=item load(url, [age])
Load the contents of a URL from the cache, with an optional age limit
=cut
sub load # exposed in JavaScript as "load()"
{
    # Return the cached content for a URL from the app's cache directory.
    # The cache file is named after the MD5 hex digest of the URL.
    #   $url     - the URL (cache key)
    #   $max_age - optional limit in seconds on how old the file may be
    # Returns '' when nothing is cached or the cached file is too old.
    my ($url, $max_age) = @_;
    my $cached = join '', "apps/$app/cache/", md5_hex($url);
    return '' unless -f $cached;

    if ($max_age)
    {
        # Age is measured from the start of this request
        my $seconds_old = int($start_time) - (stat($cached))[STAT_MOD_TIME];
        return '' if $max_age < $seconds_old;
    }
    return read_in($cached);
}
=item save(url, content)
Save the contents of a URL to the cache
=cut
sub save # exposed in JavaScript as "save()"
{
    # Store content in the app's cache directory under the MD5 hex digest
    # of the URL.
    #   $url - the URL (cache key)
    #   $out - the content to store; nothing is written when undefined
    # Caching is best effort: when the cache file cannot be opened the
    # content is simply not cached and a false value is returned.
    my ($url, $out) = @_;
    return unless defined $out;

    my $file = "apps/$app/cache/" . md5_hex($url);
    open(my $cache, '>', $file) or return '';
    print {$cache} $out;
    return close $cache;
}
=item log(text)
Write some text to the app's log file named YYYYMMDD.log in the app's "logs"
directory (for example 20090129.log is the log file for Jan 29th, 2009)
=cut
sub log_event # exposed in JavaScript as "log()"
{
    # Append a line of text to today's log file (named YYYYMMDD.log, using
    # the GMT date) in the app's "logs" directory.
    #   $text - the text to log; "&", "<" and ">" are written as XML entities
    # Logging is best effort: nothing happens when the log cannot be opened.
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/&/&amp;/g; # must come first so the entities below survive
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;

    # NOTE(review): $duration and $clock are computed but not written to the
    # log line; the original line format may have been lost - confirm.
    my $duration = sprintf("%0.3f", time() - $start_time);
    my ($sec, $min, $hour, $day, $month, $year) = gmtime();
    my $date = sprintf("%04d%02d%02d", $year+1900, $month+1, $day);
    my $clock = sprintf("%02d:%02d:%02d", $hour, $min, $sec);

    open(my $log, '>>', "apps/$app/logs/$date.log") or return;
    print {$log} "$text\n";
    close $log;
}
=item md5(data, [data]...)
Return an MD5 encoding of some data, in hex format
=cut
# See function_set('md5'... below
=item env(variable)
Return the value of an environment variable
=cut
# See function_set('env'... below
# -------------------------------------------------
# Private function to request a URL with parameters
sub _request
{
    # Send an HTTP request and return the response content.
    #   $method  - the HTTP method (GET, HEAD, DELETE, POST or PUT)
    #   $url     - the URL to request (made absolute with _full_url)
    #   $headers - optional array ref of request headers
    #   $out     - optional request content
    # Returns the response content, or '' when the request did not succeed.
    # Also records the response status line in $status and updates the
    # $bytes_in / $bytes_out counters.
    my ($method, $url, $headers, $out) = @_;
    my $req = HTTP::Request->new($method => _full_url($url), $headers, $out);
    $ua->timeout($config{http_timeout}); # seconds
    $ua->agent($config{user_agent});

    my $res = $ua->request($req);
    my $in = $res->is_success ? $res->content : '';
    $in = '' unless defined $in;
    $status = $res->status_line;
    $bytes_in += length $in;
    $bytes_out += length $out if defined $out;

    # When the content looks like markup, drop XML declarations and doctype
    # tags so it can be handed to the JavaScript "new XML(...)" constructor
    if ($config{clean_up_xml} && $in =~ /^\s*</)
    {
        $in =~ s/<(\?xml|!doctype).+?>//gis;
    }
    return $in;
}
# Private function to make a relative URL absolute
sub _full_url
{
    # Turn a URL into an absolute one.
    #   $url - an absolute URL (returned unchanged), a path starting with
    #          "/" (resolved against this host), or a relative path
    #          (resolved against the directory of PATH_INFO)
    # Returns the absolute URL, using HTTP_HOST as the host and "https" when
    # the HTTPS environment variable is set to something other than "off".
    my ($url) = @_;
    return $url if $url =~ m#^https?://#i;

    if ($url !~ m#^/#)
    {
        my $dir = dirname($ENV{PATH_INFO} || '');
        # dirname gives "." for no directory and "/" for the root; both mean
        # "directly under the host", and must not produce "//" in the URL
        $dir = '' if $dir eq '.' || $dir eq '/';
        $url = "$dir/$url";
    }

    # Some servers set HTTPS to "off" for plain HTTP requests
    my $https = lc($ENV{HTTPS} || '');
    my $scheme = ($https ne '' && $https ne 'off') ? 'https' : 'http';
    my $host = defined $ENV{HTTP_HOST} ? $ENV{HTTP_HOST} : '';
    return "$scheme://$host$url";
}
# Private function to make a relative path absolute
sub _full_path
{
    # Resolve a script-supplied file name to the path that will be opened.
    # A name starting with "/" has that one leading slash removed and is
    # returned as is; any other name is placed in the directory of
    # PATH_TRANSLATED.
    my ($path) = @_;
    return $path if $path =~ s{^/}{};

    my $base = dirname($ENV{PATH_TRANSLATED} || '');
    return "$base/$path";
}
# -----------------------------------------
# Receive and process a single CGI request
# Private function to wrap a plain-text message as a small XML error document.
# The message is escaped so that request-supplied text (script names, app
# names, tokens, JavaScript exceptions) cannot inject markup.
# NOTE(review): the wrapper element was lost from this file; the element
# name "error" is a reconstruction - confirm against the upstream source.
sub _error_xml
{
    my ($message) = @_;
    $message = '' unless defined $message;
    $message =~ s/&/&amp;/g; # must come first so the entities below survive
    $message =~ s/</&lt;/g;
    $message =~ s/>/&gt;/g;
    $message =~ s/"/&quot;/g;
    return "<error>$message</error>";
}

# CGI.pm is shown "GET" instead of "DELETE", then the real method is restored
my $method = $ENV{REQUEST_METHOD};
$ENV{REQUEST_METHOD} =~ s/DELETE/GET/ if defined $ENV{REQUEST_METHOD}; # for CGI.pm
my $cgi = CGI->new;
$ENV{REQUEST_METHOD} = $method if defined $method;

# Set the CGI script configuration defaults
%config = (
    user_agent => "NOTEX/$NOTEX::VERSION FastCGI (http://www.notex.info/)",
    http_timeout => 10, # seconds
    content_type => 'text/xml',
    xml_encoding => DEFAULT_ENCODING,
    clean_up_xml => 1,
);

# Get the params with script, app and token
%params = $cgi->Vars;
$script = $ENV{PATH_TRANSLATED}; # prefer translated paths
$script ||= $params{script} unless $ENV{NOTEX_STRICT}; # instead of script param
$app = $params{app} || cookies(COOKIE_NOTEX_APP); # get app param or cookie
$app = $1 if $script && $script =~ m#/apps/([^/]+)#; # but prefer script paths
$token = $params{token} || cookies(COOKIE_NOTEX_TOKEN);
unless ($script and $app and ($token or $ENV{PATH_TRANSLATED}))
{
    write_out(_error_xml($script ?
        qq{Need a token to run script "$script"} :
        'No script'));
    exit;
}

# Add this information to the script config
$config{app} = $app;
$config{token} = $token ||= DEFAULT_TOKEN_NAME;
$config{script} = $script;
$config{remote_host} = $remote_host = $cgi->remote_host();

# Create a user agent to read network files
$ua = LWP::UserAgent->new;
$query = '';
$status = '';
@cookies = ();
%headers = ();

# Measure the cost from running this script
$bytes_in = 0;
$bytes_out = 0;
$start_time = time();
$request_id++;

# Check that the app token exists. The app and token names come from the
# request and are used to build file paths, so names that could step outside
# the "apps" directory (path separators, "." and "..") are refused first.
my $unsafe = grep { m{[/\\\0]} || m{\A\.\.?\z} } $app, $token;
if ($unsafe || !-f "apps/$app/$token")
{
    write_out(_error_xml(qq{App "$app" has no token "$token"}));
    exit;
}

# Return cached results if requested and available
if (my $age = $params{cache})
{
    # The cache key is the query string without the cache settings
    $query = query_string();
    $query =~ s/\&?j?cache=\d+//g;
    if (my $cached = load($query, $age))
    {
        print $cached;
        $bytes_out += length $cached;
        log_event("Processed $script (cached)");
        exit;
    }
}

# Not cached, so read or GET the script and optionally cache it
my $javascript = $ENV{PATH_TRANSLATED} ? read_in($script) # from file system
                                       : GET($script, $params{jcache});
if (!$javascript)
{
    write_out(_error_xml(qq{Cannot read script "$script"}));
    exit;
}

# Run the script using SpiderMonkey's E4X support
my $js = JavaScript::SpiderMonkey->new;
$js->init();
my $out = ''; # collects everything the script passes to "write()"
$js->function_set('http', sub { return http($_[0]); });
$js->function_set('https', sub { return https($_[0]); });
$js->function_set('write', sub { $out .= join("\n", @_); });
$js->function_set('read', sub { return read_in($_[0], SCRIPT_PATH_MATCH) });
$js->function_set('param', sub {
    # Fall back to the default only for a missing or empty parameter, so
    # that a parameter value of "0" is returned rather than discarded
    my ($name, $default) = @_;
    my $value = $params{$name};
    return (defined($value) && length($value)) ? $value : $default;
});
$js->function_set('config', sub { return config(@_); });
$js->function_set('cookie', sub { return cookies(@_); });
$js->function_set('header', sub { return headers(@_); });
$js->function_set('method', sub { return $method; });
$js->function_set('status', sub { return $status; });
$js->function_set('DELETE', sub { return _request(DELETE => $_[0]); });
$js->function_set('HEAD', sub { return _request(HEAD => $_[0]); });
$js->function_set('GET', sub { return GET(@_); });
$js->function_set('PUT', sub { return POST_or_PUT(PUT => @_); });
$js->function_set('POST', sub { return POST_or_PUT(POST => @_); });
$js->function_set('log', sub { log_event('JS: '.join(' ', @_)); });
$js->function_set('md5', sub { return md5_hex(@_); });
$js->function_set('env', sub { return $ENV{$_[0]}; });
$js->function_set('load', sub { return load(@_); });
$js->function_set('save', sub { return save(@_); });
$js->function_set('__err', sub { $out = _error_xml(join(' ', @_)); });
log_event("Processing $script");

# Exceptions thrown by the script are caught in JavaScript and reported via
# __err(); errors the engine itself reports (for example a script that does
# not compile) make eval() return false with the message in $@.
my $ok = $js->eval("try { $javascript } catch(e) { __err(e); }"); # catch throw() calls
unless ($ok)
{
    my $error = $@ || 'unknown error';
    log_event("Failed to run $script: $error");
    $out = _error_xml(qq{Cannot run script "$script": $error}) unless length $out;
}
write_out($out);
log_event("Processed $script");
__END__
=back
=head1 DEPENDENCIES
Time::HiRes, Digest::MD5, LWP::UserAgent, File::Basename, CGI,
JavaScript::SpiderMonkey, and the SpiderMonkey source code from Mozilla
to compile and install "libjs".
=head1 AUTHOR
Kevin Hutchinson
=head1 COPYRIGHT
Copyright (c) 2009 Legendum, LLC.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 3
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.