#!/usr/bin/env perl =head1 NAME notex.cgi - Network Oriented Transforms in ECMAScript and XML (NOTEX) Transform XML docs with the SpiderMonkey JavaScript engine =head1 VERSION This document refers to version 2.1.1 of notex.cgi, released Feb 24, 2009 =head1 DESCRIPTION notex.cgi transforms XML documents with JavaScript. It is run via CGI by providing a number of query string parameters in the URL, including: * "app": The name of the application running this CGI script * "token": An app token (a filename in the app's directory) * "script": The URL of the JavaScript file to read and run * "jcache": How long (in secs) to cache the JavaScript file (opt) * "cache": How long (in secs) to cache the CGI output (opt) The "script" parameter may be omitted if you're using Apache actions to run this CGI script automatically for JavaScript files with the extension ".jsx". Here is an example Apache configuration file called ".htaccess": ---------------------------- # Run .jsx NOTEX JavaScripts Options ExecCGI AddHandler cgi-script .cgi AddHandler jsx-notex .jsx Action jsx-notex /notex.cgi # Setup the NOTEX environment Setenv NOTEX_STRICT 0 ---------------------------- The JavaScript file may include a number of function calls that are exposed by this CGI program. Here they are included in this example JavaScript code: log('Get my bill details and cache them for 10 mins'); var bill = new XML(GET('mydomain.com/mybill.xml', 600)); // secs var output = { bill.buyer.toString() } { bill.value.toString() } write('', output.toXMLString(), ''); log('Have written my receipts'); This example demonstrates the use of "log()", "GET()" and "write()", plus the E4X processing available in the SpiderMonkey JavaScript engine. Please see: https://developer.mozilla.org/Special:Tags?tag=E4X&language=en for details about processing XML using JavaScript in SpiderMonkey. 
In addition, a function called "param()" is exposed by this CGI program so that the JavaScript file can be parameterized via the URL query string, for example to read particular XML files for processing or to change the way an XML data file is processed. Here's an example: var url = param('url'); // get the URL from the query parameter var xml = new XML(GET(url)); // read the URL as XML data xml += { config('app') } write('', xml.toXMLString(), ''); All activity is logged in the application's log directory, and results may be cached for a period by specifying a cache age in seconds in the query. In addition, files read with the "GET()" function may also be cached by specifying an age as the second argument like this: "x = GET(url, 600);". HTTP methods "HEAD()", "DELETE()", "POST()" and "PUT()" are also supported (see below for details) but only the "GET()" method supports caching. Each application has a directory in "apps", with 2 writable directories: The log directory is called "logs" and data files are cached in "cache". Be sure to "touch" a token file in each app's directory for security. The filename of the token file must match the "token" query string parameter. =head2 Examples Here are some example query URLs to this CGI script: 1 notex.cgi?app=demo&token=abc&script=mysite.com/mycode.js 2 notex.cgi?app=demo&token=abc&script=mysite.com/mycode.js&jcache=600 3 notex.cgi?app=demo&token=abc&script=mysite.com/mycode.js&cache=600 Version 1 will always read the script "mycode.js" and run it whenever the URL is requested. Version 2 will cache the script "mycode.js" but run it whenever the URL is requested. Version 3 will run the script "mycode.js" one time, then cache the output for any future requests. 
=cut

package NOTEX;

$VERSION = '2.1.1';

use strict;
use warnings;

# Names and patterns shared by the whole script
use constant {
    COOKIE_NOTEX_APP   => 'notex_app',
    COOKIE_NOTEX_TOKEN => 'notex_token',
    DEFAULT_TOKEN_NAME => 'token',
    DEFAULT_ENCODING   => 'utf-8',
    SCRIPT_PATH_MATCH  => '^[\w\-\.\\/]+\.jsx?$',
    STAT_MOD_TIME      => 9,
};

use Time::HiRes qw(time);
use Digest::MD5 qw(md5_hex);
use LWP::UserAgent;
use File::Basename;
use CGI qw(:cgi -debug);
use JavaScript::SpiderMonkey;

# Declare globals

our %params;      # query string (or PUT/POST) parameters
our $app;         # the "app" parameter
our $token;       # the "token" parameter
our $script;      # the "script" parameter
our $remote_host; # the IP address making the CGI request
our %config;      # various NOTEX configuration settings
our $ua;          # our user agent when sending requests
our $query;       # the query string in the requested URL
our $status;      # the status of our most recent request
our @cookies;     # a list of cookies to set in requests
our %headers;     # a hash of headers to set in our response
our $bytes_in;    # total bytes read into this CGI script
our $bytes_out;   # total bytes written out of CGI script
our $start_time;  # the time the CGI request was received
our $request_id;  # the ID of each request to this script

=head2 Functions exposed to your ECMAScript (JavaScript) code

=over 4

=item GET(url, [age])

Fetch the contents of a URL and return them. When an age (in seconds) is
given, a cached copy no older than that age is returned if one exists, and
a freshly fetched response is stored in the cache. An empty string is
returned when no URL is given.

=cut

sub GET # exposed in JavaScript as "GET()"
{
    my ($url, $age) = @_;
    return '' unless $url;

    # The cache is only consulted when the caller asked for an age limit
    my $cached = $age ? load($url, $age) : '';
    return $cached if $cached;

    # Nothing usable in the cache, so go to the network
    my $fetched = _request(GET => $url);
    save($url, $fetched) if $age;

    return $fetched;
}

=item HEAD(url) & DELETE(url)

Send a HEAD or DELETE request for a URL

=cut

# See function_set('HEAD'... below
# See function_set('DELETE'...
=item POST(url, content, [content_type]) & PUT(url, content, [content_type])

POST or PUT some content to a URL and return the response contents
(the default content type is "application/x-www-form-urlencoded")

=cut

sub POST_or_PUT # exposed in JavaScript as "POST()" and "PUT()"
{
    my ($method, $url, $out, $content_type) = @_;

    $content_type ||= 'application/x-www-form-urlencoded';

    return _request($method, $url, ['Content-Type' => $content_type], $out);
}

=item write(content)

Write some content as HTTP output using the "header()" and "config()" settings

=cut

sub write_out # exposed in JavaScript as "write()"
{
    my ($out) = @_;
    $out = '' unless defined $out; # avoid "uninitialized" warnings below

    # The content type falls back to the configured default
    $headers{-type} ||= $config{content_type};

    # XML output that does not already start with an XML declaration gets one.
    # NOTE(review): the declaration text was reconstructed from the guard
    # regex and the "xml_encoding" config setting; the literal had been
    # reduced to a bare "\n" prefix. Confirm against the pristine file.
    if ($out !~ /^\s*<\?xml/ && $headers{-type} eq 'text/xml')
    {
        $out = "<?xml version=\"1.0\" encoding=\"$config{xml_encoding}\"?>\n$out";
    }

    # Attach any cookies queued for this response, then emit headers + body
    $headers{-cookie} = \@cookies if @cookies;
    $out = header(%headers) . $out;
    print $out;
    $bytes_out += length $out;

    # $query is only set when the request asked for its output to be cached
    save($query, $out) if $query;
}

=item read(filename)

Read a filename (providing it's a JavaScript file ending ".js" or ".jsx")

=cut

sub read_in # exposed in JavaScript as "read()"
{
    my ($file, $match) = @_; # match user files
    return '' unless defined $file && length $file;

    # When a pattern is supplied the name must match it, and only then is
    # it resolved to a full path
    if ($match)
    {
        return '' if $file !~ /$match/;
        $file = _full_path($file);
    }

    # Three-argument open with a lexical handle: the name can never be
    # interpreted as an open mode or a pipe. An unreadable or missing file
    # yields an empty string.
    open(my $fh, '<', $file) or return '';
    my $in = do { local $/; <$fh> }; # slurp the whole file
    close $fh;

    return defined $in ? $in : '';
}

=item http(header) & https(header)

Return an HTTP(S) header, for example 'Accept-language' or 'User-agent'

=cut

# See function_set('http'... below
# See function_set('https'... below

=item method()

Return the HTTP method used to call this script (GET, HEAD, PUT, POST or DELETE)

=cut

# See function_set('method'... below

=item status()

Return the HTTP status of the most recent HTTP request made by this script

=cut

# See function_set('status'... below

=item param(name, [default])

Return a parameter from the URL query string, or the default value otherwise

=cut

# See function_set('param'...
below =item config(name, [value]) Return a config setting, and optionally set it to a new value: * app: the application name running this CGI script * token: the app token (a filename in the app's directory) * script: the script being run (either a URL or a file path) * remote_host: the remote host name or address running the script * user_agent: by default the user agent is called "NOTEX/2.1.1" * http_timeout: by default the HTTP request timeout is 10 seconds * content_type: defaults to "text/xml" but can be any other type * xml_encoding: defaults to "utf-8" for Unicode but can be other * clean_up_xml: defaults to 1 to remove $name) unless defined $value; push @cookies, cookie(-name => $name, -value => $value, -expires => $expires, -path => $path, -domain => $domain, -secure => $secure); } =item header(name, [value]) Get or set a header (if you only provide the header name, its value is returned) =cut sub headers # exposed in JavaScript as "header()" { my ($name, $value) = @_; $headers{$name} = $value if defined $value; return $headers{$name}; } =item load(url, [age]) Load the contents of a URL from the cache, with an optional age limit =cut sub load # exposed in JavaScript as "load()" { my ($url, $age) = @_; my $file = "apps/$app/cache/" . 
md5_hex($url); return '' unless -f $file; return '' if $age && $age < int($start_time) - (stat($file))[STAT_MOD_TIME]; return read_in($file); } =item save(url, content) Save the contents of a URL to the cache =cut sub save # exposed in JavaScript as "save()" { my ($url, $out) = @_; return unless defined $out; my $file = md5_hex($url); open (CACHE, ">apps/$app/cache/$file"); print CACHE $out; close CACHE; } =item log(text) Write some text to the app's log file named YYYYMMDD.log in the app's "logs" directory (for example 20090129.log is the log file for Jan 29th, 2009) =cut sub log_event # exposed in JavaScript as "log()" { my ($text) = @_; $text =~ s/&/&/g; $text =~ s//>/g; my $duration = sprintf("%0.3f", time() - $start_time); my ($sec, $min, $hour, $day, $month, $year) = gmtime(); my $date = sprintf("%04d%02d%02d", $year+1900, $month+1, $day); my $clock = sprintf("%02d:%02d:%02d", $hour, $min, $sec); open (LOG, ">>apps/$app/logs/$date.log"); print LOG "$text\n"; close LOG; } =item md5(data, [data]...) Return an MD5 encoding of some data, in hex format =cut # See function_set('md5'... below =item env(variable) Return the value of an environment variable =cut # See function_set('env'... below # ------------------------------------------------- # Private function to request a URL with parameters sub _request { my ($method, $url, $headers, $out) = @_; my $req = new HTTP::Request($method => _full_url($url), $headers, $out); $ua->timeout($config{http_timeout}); # seconds $ua->agent($config{user_agent}); my $res = $ua->request($req); my $in = $res->is_success ? 
$res->content : ''; $status = $res->status_line; $bytes_in += length $in; $bytes_out += length $out if $out; $in =~ s/<(\?xml|!doctype).+?>//gis if $in =~ /^\s* "NOTEX/$NOTEX::VERSION FastCGI (http://www.notex.info/)", http_timeout => 10, # seconds content_type => 'text/xml', xml_encoding => DEFAULT_ENCODING, clean_up_xml => 1, ); # Get the params with script, app and token %params = $cgi->Vars; $script = $ENV{PATH_TRANSLATED}; # prefer translated paths $script ||= $params{script} unless $ENV{NOTEX_STRICT};# instead of script param $app = $params{app} || cookies(COOKIE_NOTEX_APP); # get app param or cookie $app = $1 if $script =~ m#/apps/([^/]+)#; # but prefer script paths $token = $params{token} || cookies(COOKIE_NOTEX_TOKEN); unless ($script and $app and ($token or $ENV{PATH_TRANSLATED})) { write_out($script ? "Need a token to run script "$script"" : "No script"); exit; } # Add this information to the script config $config{app} = $app; $config{token} = $token ||= DEFAULT_TOKEN_NAME; $config{script} = $script; $config{remote_host} = $remote_host = $cgi->remote_host(); # Create a user agent to read network files $ua = new LWP::UserAgent; $query = ''; $status = ''; @cookies = (); %headers = (); # Measure the cost from running this script $bytes_in = 0; $bytes_out = 0; $start_time = time(); $request_id++; # Check that the app token exists if (!-f "apps/$app/$token") { write_out("App "$app" has no token "$token""); exit; } # Return cached results if requested and available if (my $age = $params{cache}) { $query = query_string(); $query =~ s/\&?j?cache=\d+//g; if (my $out = load($query, $age)) { print $out; $bytes_out += length $out; log_event("Processed $script (cached)"); exit; } } # Not cached, so read or GET the script and optionally cache it my $javascript = $ENV{PATH_TRANSLATED} ? 
read_in($script) # from file system : GET($script, $params{jcache}); if (!$javascript) { write_out("Cannot read script "$script""); exit; } # Run the script using SpiderMonkey's E4X support my $js = new JavaScript::SpiderMonkey; $js->init(); my $out = ''; $js->function_set('http', sub { return http($_[0]); }); $js->function_set('https', sub { return https($_[0]); }); $js->function_set('write', sub { $out .= join("\n", @_); }); $js->function_set('read', sub { return read_in($_[0], SCRIPT_PATH_MATCH) }); $js->function_set('param', sub { return $params{$_[0]} || $_[1] }); $js->function_set('config', sub { return config(@_); }); $js->function_set('cookie', sub { return cookies(@_); }); $js->function_set('header', sub { return headers(@_); }); $js->function_set('method', sub { return $method; }); $js->function_set('status', sub { return $status; }); $js->function_set('DELETE', sub { return _request(DELETE => $_[0]); }); $js->function_set('HEAD', sub { return _request(HEAD => $_[0]); }); $js->function_set('GET', sub { return GET(@_); }); $js->function_set('PUT', sub { return POST_or_PUT(PUT => @_); }); $js->function_set('POST', sub { return POST_or_PUT(POST => @_); }); $js->function_set('log', sub { log_event('JS: '.join(' ', @_)); }); $js->function_set('md5', sub { return md5_hex(@_); }); $js->function_set('env', sub { return $ENV{$_[0]}; }); $js->function_set('load', sub { return load(@_); }); $js->function_set('save', sub { return save(@_); }); $js->function_set('__err', sub { $out = ''.join(' ', @_).''; }); log_event("Processing $script"); $js->eval("try { $javascript } catch(e) { __err(e); }"); # catch throw() calls write_out($out); log_event("Processed $script"); __END__ =back =head1 DEPENDENCIES Time::HiRes, Digest::MD5, LWP::UserAgent, File::Basename, CGI, JavaScript::SpiderMonkey, and the SpiderMonkey source code from Mozilla to compile and install "libjs". =head1 AUTHOR Kevin Hutchinson =head1 COPYRIGHT Copyright (c) 2009 Legendum, LLC. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.