#!/usr/bin/perl ####################################################### # # Creates Arca NBBO dataset for analysis to test side # inference method # # Dale W.R. Rosenthal # $Id: CreateArcaNBBODataset.pl,v 1.1 2008/03/26 19:33:01 dale Exp $ ####################################################### use strict; use Getopt::Long; my($quote_file); GetOptions("quotefile=s" => \$quote_file); die("usage: $0 -quotefile file\n") if !defined $quote_file; Log("Starting $0"); LogLineStart("Reading in Russell 3000 members for 2004H2-2005H1"); open(RUSSELLFILE, "R3000.2004.csv") or die "Cannot read in Russell 3000 names!"; my @r3000_names = ; close RUSSELLFILE; chomp @r3000_names; my %r3000_map = (); @r3000_map{@r3000_names} = (1) x @r3000_names; LogLineEnd(); ReportOutputHeader(); # Require files to be sorted in time order my @months = qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC); my @month_nums = 1..12; my %mon2mm = (); $mon2mm{@months} = @month_nums; my @quote_fields = qw(symbol insd_bid_pr insd_bid_size insd_ask_pr insd_ask_size sequence); my @prev_quote_fields = qw(insd_bid_pr insd_bid_size insd_ask_pr insd_ask_size sequence); my $lastquote = {}; my $lastquote_timeinseconds = 0; my @qtfieldnames = OpenQuoteFileAndReadHeader($quote_file); # initialize same seconds quote queue my @samesecond_quotes = (); while (my $lyne = ) { chomp $lyne; my(@fieldz) = split /,/, $lyne; my $qt = {}; @$qt{@qtfieldnames} = @fieldz; # enqueue same second quote for vetted symbol if (@samesecond_quotes > 0 and $qt->{symbol} eq $samesecond_quotes[0]->{symbol} and $qt->{q_date} eq $samesecond_quotes[0]->{q_date} and $qt->{q_time} eq $samesecond_quotes[0]->{q_time}) { push @samesecond_quotes, $qt; next; } # print out enqueued (same second) quotes my $num_ss_quotes = scalar(@samesecond_quotes); if ($num_ss_quotes > 0) { my $i = 0; my($dd, $mon, $ccyy) = unpack("a2a3a4", $samesecond_quotes[0]->{q_date}); my $ccyymmdd = sprintf("$ccyy%02d$dd", $mon2mm{$mon}); my($hh, $mm, $ss) = split /:/, $samesecond_quotes[0]->{q_time}; while (my $quote = shift @samesecond_quotes) { if ($quote->{q_condition} eq "") { $i++; next; } my $time_in_seconds = ($hh*60 + $mm)*60 + $ss + $i/$num_ss_quotes; print(STDOUT "$ccyymmdd,$time_in_seconds,", join(",", @$quote{@quote_fields})); print(STDOUT ",$lastquote_timeinseconds,", join(",", @$lastquote{@prev_quote_fields}), "\n"); $lastquote = $quote; $lastquote_timeinseconds = $time_in_seconds; $i++; } } if ($lastquote->{symbol} ne $qt->{symbol}) { LogLineEnd(); if (exists $r3000_map{$qt->{symbol}}) { LogLineStart("Processing $qt->{symbol}, $qt->{q_date}: "); } else { LogLineStart("Skipping $qt->{symbol}, $qt->{q_date}: "); } $lastquote = $qt; my @fields2zero = qw(insd_bid_pr insd_bid_size insd_ask_pr insd_ask_size); @$lastquote{@fields2zero} = (0) x @fields2zero; $lastquote_timeinseconds = 0; } next if !exists $r3000_map{$qt->{symbol}}; push @samesecond_quotes, $qt; } close QTFILE; LogLineEnd(); Log("Stopping $0"); exit 0; sub Log { my($message) = @_; my($ss, $mm, $hh, $dd, $mo, $yr) = localtime(); my $timestamp = sprintf("%4d%02d%02d %02d:%02d:%02d", $yr+1900, $mo+1, $dd, $hh, $mm, $ss); print(STDERR "$timestamp: $message\n"); } sub LogLineStart { my($message) = @_; my($ss, $mm, $hh, $dd, $mo, $yr) = localtime(); my $timestamp = sprintf("%4d%02d%02d %02d:%02d:%02d", $yr+1900, $mo+1, $dd, $hh, $mm, $ss); print(STDERR "$timestamp: $message"); } sub LogLineAppend { my($message) = @_; print(STDERR "$message"); } sub LogLineEnd { print(STDERR "\n"); } sub ReportOutputHeader { my $head = join(",", qw(date timeseconds symbol bid bidsz ask asksz seqno prev_timeseconds prev_bid prev_bidsz prev_ask prev_asksz prev_seqno)); print STDOUT "$head\n"; } sub OpenQuoteFileAndReadHeader { my($quote_file) = @_; # Get header info from the quotes file if (!open(QTFILE, $quote_file)) { Log("Cannot open quote file '$quote_file'!"); exit -1; } my $qtheader = ; chomp $qtheader; my @qtfieldnames = split /,/, $qtheader; return @qtfieldnames; } sub PrimaryExchange { my($symbol) = @_; my @amex_names = qw(AFP AVD AVN AX BCP BHL BIO BL BMI CAC CAS CHC COI CPD CTO CUB DAR DFC DHB DHC END FIZ GRC GSX GTE GW HH HT HTC IMA IVX KFX LB LGN LNG MIX MLP MSS MWP NBY NHC NHR NVR OHB OMR PDC PGC PRK PRZ PSB RIV SEB SJW STB TBV TDS TIV TKO TMP TPY TWW USM WFD WSC); my @nyse_multiclass = qw(BF.A BF.B CRD.A CRD.B FCE.A FCE.B FSL FSL.B JW.A JW.B KV.A KV.B MOG.A MOG.B NWS NWSA VIA VIA.B SQA.A SQA.B TRX TRX.B TRY TRY.B); my $primary_exchange; if (grep /^$symbol$/, @amex_names) { $primary_exchange = "A"; } elsif (grep /^$symbol$/, @nyse_multiclass) { $primary_exchange = "N"; } elsif (length($symbol) >= 4) { $primary_exchange = "T"; } else { # only non-AMEX 1, 2, and 3-letter symbols left $primary_exchange = "N"; } return $primary_exchange; } sub MarketOpen { my($date) = @_; # Currently, this is just a stub return "9:30:00"; } sub MarketClose { my($date) = @_; # Currently, this is just a stub return "16:00:00"; } sub Seconds2Time { my($seconds) = @_; my $hh = int($seconds / (60*60)); $seconds -= $hh*60*60; my $mm = int($seconds / 60); $seconds -= $mm*60; return sprintf("$hh:%02d:%02.4f", $mm, $seconds); } sub min { my $min = $_[0]; foreach my $val (@_) { $min = $val if $val < $min; } return $min; } sub max { my $max = $_[0]; foreach my $val (@_) { $max = $val if $val > $max; } return $max; }