(use regex posix match sqlite3) (use posix-time apache-util) (use http-utils url) ;; warning: url has a bug which changes + to " " -- shouldn't be done in URL path ;;; Regular expressions (define re:blank-line (regexp "^\\s*$")) (define optional-whitespace "\\s*") (define quoted-field "\"(.*?)\"") (define time-field "\\[(.*?)\\]") (define numeric-field "(\\d+)") (define re:apache-combined (regexp (string-append "^" "(.*?)" " - - " time-field " " quoted-field " " numeric-field " " "(\\d+|-)" " " quoted-field " " quoted-field "$" ) )) ;;; Default configuration #| (define db (open-db database-path)) |# (define database-path "~/db/searches.db") (define (available-logs) (let ((basedir "~/scheme/apache-logs/logs/")) (append (glob (string-append basedir "access_log.1.200*")) (glob (string-append basedir "access_log.200*"))))) ;;; Code (define (sqlite3:begin db) (sqlite3:exec db "BEGIN TRANSACTION;")) (define (sqlite3:end db) (sqlite3:exec db "END TRANSACTION;")) ;; Unfortunately we have no way to test if a table exists -- CREATE TABLE IF NOT EXISTS ;; and DROP TABLE IF EXISTS simply fail due to the step! API. (define (open-db path) (let ((db (sqlite3:open path))) (sqlite3:exec db "create temp table searches_tmp(id integer, time integer, referer text, pretty_name text, term text);") db)) ;; Read in apache log fields from PORT and insert all fields into DB table access_logs. ;; This will generate gigantic database files. ;; DATABASE SCHEMA: ;; create table access_logs(id integer primary key, host text, time integer, ;; request text, status integer, size integer, referer text, useragent text) ; (define (update-db db port) (let ((stmt (sqlite3:prepare db "insert into access_logs values(NULL, ?, ?, ?, ?, ?, ?, ?);"))) (sqlite3:exec db "BEGIN TRANSACTION;") (for-each-line (lambda (line) (cond ((string-search re:blank-line line) 'skip) ((string-search re:apache-combined line) => (match-lambda ((_ host time request status size referer user-agent) (sqlite3:exec stmt host (apache:date->seconds time) request status (if (string->number size) size 0) referer user-agent) ))) (else (warning (conc "invalid logfile line: " line))))) port) (sqlite3:exec db "END TRANSACTION;") (sqlite3:finalize! stmt))) ;; Update search table. ;; DATABASE SCHEMA: ;; create table searches(id integer primary key, time integer, referer text, pretty_name text, term text); ;; Note: convenience functions for bind API don't support NULL values. (define (update-db-searches db port) (let ((stmt (sqlite3:prepare db "insert into searches values(NULL, ?, ?, ?, ?);")) (stmt-no-query (sqlite3:prepare db "insert into searches values(NULL, ?, ?, NULL, NULL);"))) (sqlite3:begin db) (for-each-line (lambda (line) (cond ((string-search re:blank-line line) 'skip) ((string-search re:apache-combined line) => (match-lambda ((_ host time request status size referer user-agent) (let ((u (url referer))) (receive (_ args) (http:decode-url referer) (unless (obviously-bogus-referer u args) (let ((search-term (extract-search-term u args))) (if search-term (sqlite3:exec stmt (apache:date->seconds time) referer (pretty-name (url-host u)) search-term) (sqlite3:exec stmt-no-query (apache:date->seconds time) referer))))))))) (else (warning (conc "invalid logfile line: " line))))) port) (sqlite3:end db) ;; These should be wrapped in dynamic-wind or handle-exceptions. (sqlite3:finalize! stmt) (sqlite3:finalize! stmt-no-query) )) ;; Perform extraction processing on all referer URLs in the selected records returned ;; by search-stmt, and update records upon which search extraction succeeded. ;; This populates a temporary table and then transfers the records over when complete. ;; Returns number of changes that occurred. ;; ;; search-stmt: Returns ID and REFERER columns from result set. ;; e.g. SELECT id, referer FROM searches WHERE term ISNULL [with or without WHERE clause] ;; Requires existing temp table searches_tmp, because we cannot test for existence. (define (rewrite-db-searches/stmt db search-stmt) (sqlite3:exec db "DELETE FROM searches_tmp;") (sqlite3:call-with-temporary-statements (lambda (update-stmt) (sqlite3:begin db) (sqlite3:for-each-row (lambda (id time referer pretty_name _) (let ((u (url referer))) (receive (_ args) (http:decode-url referer) (unless (obviously-bogus-referer u args) (let ((term (extract-search-term u args))) (when term (print term) (sqlite3:exec update-stmt id time referer (pretty-name (url-host u)) term ))))))) search-stmt) (sqlite3:end db)) db "INSERT INTO searches_tmp values(?, ?, ?, ?, ?);") (sqlite3:update db "REPLACE INTO searches SELECT * FROM searches_tmp;")) ;; Obtain all records with NULL search terms and attempt to rewrite them ;; (usually because our extraction algorithm has improved). (define (rewrite-null-db-searches db) (sqlite3:call-with-temporary-statements (lambda (stmt) (rewrite-db-searches/stmt db stmt)) db "SELECT * FROM searches WHERE term ISNULL;")) ;; Passing U (a URL) and ARGS so they are only computed once. (define (obviously-bogus-referer u args) (or (null? args) ; No arguments (not (equal? (url-scheme u) "http")) ; Not an HTTP URL (string-match "[A-za-z0-9._-]*3e8\\.org" (url-host u)))) ; From 3e8.org domain (define (pretty-name hostname) (define (s re) (string-search re hostname)) (cond ((s re:google-images) "Google Images") ((s re:google) "Google") ((s re:yahoo-images) "Yahoo! Images") ((s re:yahoo) "Yahoo!") ((s re:virgilio) "Virgilio") ((s re:altavista) "Altavista") ((s re:ask.com) "ask.com") ((s re:freshmeat) "Freshmeat") ((s re:msn) "MSN") ((s re:mywebsearch) "mywebsearch") (else hostname))) (define re:google-images (regexp "^images.google\\.")) (define re:google (regexp "^(?:www\\.)?google\\.")) (define re:yahoo-images (regexp "^images\\.search\\.yahoo\\.")) (define re:yahoo (regexp "^(?:.*?\\.)?search\\.yahoo\\.")) ;; May have subdomains, e.g. myweb2 (define re:virgilio (regexp "^search\\.virgilio\\.it")) (define re:altavista (regexp "^(?:www\\.)?altavista\\.")) (define re:ask.com (regexp "^(?:www\\.)?ask\\.")) (define re:freshmeat (regexp "^(?:www\\.)?freshmeat\\.")) (define re:ubbi.com (regexp "^(?:www\\.)?ubbi\\.com")) (define re:msn (regexp "^search\\.(?:[^.]\\.)?msn\\.com$")) ;; search.(subdomain.)msn.com (define re:mywebsearch (regexp "^search\\.mywebsearch\\.com$")) ;; Google queries: as_q: all of the words ;; as_epq: exact phrase (phrase quoted) ;; as_eq: without words (negated) ;; as_oq: at least one word (ORed together) (define (extract-search-term url args) (define (args-ref key args) (alist-ref key args string=?)) (define (args-path path args) (if (null? path) args (let ((val (args-ref (car path) args))) (if (null? (cdr path)) val (and val (receive (_ args) (http:decode-url val) (args-path (cdr path) args))))))) (define (negate-words str) ; Prepend "-" to every word in STR. (if (not str) "" (string-join (map (lambda (x) (string-append "-" x)) (string-split str)) " "))) (define (quote-words str) (if (not str) "" (string-append "\"" str "\""))) (define (or-words str) (if (not str) "" (string-join (string-split str) " OR "))) (let ((hostname (url-host url))) (define (s re) (string-search re hostname)) (cond ((s re:google-images) (or (args-path '("prev" "q") args) (args-ref "q" args))) ((s re:google) (cond ((args-ref "as_q" args) => (lambda (as_q) (string-append as_q " " (or-words (args-ref "as_oq" args)) " " (quote-words (args-ref "as_epq" args)) " " (negate-words (args-ref "as_eq" args))))) (else (args-ref "q" args)))) ((s re:yahoo-images) (args-ref "p" args)) ((s re:yahoo) (args-ref "p" args)) ((s re:virgilio) (args-ref "qs" args)) ((s re:altavista) (args-ref "q" args)) ((s re:ask.com) (args-ref "q" args)) ((s re:freshmeat) (args-ref "q" args)) ((s re:ubbi.com) (args-ref "q" args)) ((s re:msn) (args-ref "q" args)) ((s re:mywebsearch) (args-ref "searchfor" args)) (else #f)))) ;; Internal: test extract-search-term and pretty-name. (define (%test-extract str) (let ((u (url str))) (list (pretty-name (url-host u)) (extract-search-term u (receive (_ args) (http:decode-url str) args))))) ; DATABASE SCHEMA: ; create table logged_files(filename text); (define (record-logged-file db pathname) (let ((basename (pathname-strip-directory pathname))) (sqlite3:exec db (string-append "INSERT INTO logged_files values('" basename "');")))) (define (process-files db files) (for-each (lambda (fn) (print "* Processing " fn) (call-with-gzipped-file fn (lambda (fn) (call-with-input-file fn (lambda (prt) (update-db-searches db prt) (record-logged-file db fn)))))) files)) ;; Gunzip fn and call proc with the .gz extension stripped off. ;; If there is no .gz extension, proc is called with the original file. ;; We just warn if the gunzip fails and return an unspecified value. ;; Could error out instead. (define (call-with-gzipped-file fn proc) (define (expand path) (##sys#expand-home-path path)) (if (not (equal? (pathname-extension fn) "gz")) (proc fn) (let ((basename (pathname-strip-extension fn)) (created-file? #f) (ret #f)) (dynamic-wind (lambda () (void)) ; This isn't right; a jump into this context won't work. (lambda () (if (or (file-exists? basename) (not (= (system (sprintf "gunzip -c ~s > ~s" ; 0 byte file might be created!! ;; We're quoting for some safety. ;; Expand tildes manually. (expand fn) (expand basename))) 0))) (warning (sprintf "(~s) failed to gunzip: ~s" 'call-with-gzipped-file fn)) (begin (set! created-file? #t) (proc basename)))) (lambda () (if created-file? ;; Manually expand as workaround for Chicken bug (delete-file (expand basename)))))))) ;; Get a list of all files (basenames) recorded in the logged_files table. (define (get-logged-files db) (sqlite3:map-row identity db "SELECT filename from logged_files;")) ;; Process only new files, filtering out any parsed ones using get-logged-files. ;; XXX FIXME -- we should filter out duplicates as well (after stripping .gz extension off). (define (process-new-files db files) (define re:gz-extension (regexp "\\.gz$")) (define (strip-optional-extension ext fn) (string-substitute re:gz-extension "" fn)) ;; Strip directory and .gz extension when comparing filenames, but return ;; the full filenames. (define (filter-against-existing-files files) (let* ((existing-files (get-logged-files db)) (file-blobs (map (lambda (fn) (cons fn (pathname-strip-directory fn))) files))) (map car (filter (lambda (blob) (not (member (strip-optional-extension "gz" (cdr blob)) existing-files))) file-blobs)))) (process-files db (filter-against-existing-files files))) (define (empty-tables db) (sqlite3:exec db "delete from searches;") (sqlite3:exec db "delete from logged_files;")) #| (process-new-files db (available-logs)) (rewrite-null-db-searches db) (define (up db) (process-files (list "~/scheme/apache-logs/logs/access_log.1.20060201"))) (define (up2 db) (process-files (glob "~/scheme/apache-logs/logs/access_log*"))) (sqlite3:first-result db "select count(*) from raw_searches;") ;; get top 20 referers (raw url) "SELECT referer,count(*) as Number FROM raw_searches GROUP BY referer ORDER BY Number DESC, referer LIMIT 20;" |#