#!/usr/bin/perl
#######################################################
#
# Runs scripts to create out-of-sample performance
# datasets for trade direction GLMM.
#
# Dale W.R. Rosenthal
# $Id$
#######################################################

use strict;
no strict "subs";
use Math::MatrixReal;

my $trades_dir = "/home2/dale/trds";
my $streamnbbo_dir = "/home2/dale/streamnbbo";
my $expected_quotes_dir = "/home2/dale/expbidask";
my $glmm_data_dir = "/home2/dale/r";
my $universe_file = "/home2/dale/sectors.full.csv";

# Children forked while estimating quotes are never waited for.
$SIG{CHLD} = "IGNORE";

Log("Starting $0");

LogLineStart("Loading sectors...");
my $sectors = ReadInUniverseAndSectors($universe_file);
LogLineAppend("done.");
LogLineEnd();

LogLineStart("Loading all trades in our universe...");
my $all_trades = LoadAllTrades($trades_dir, $sectors);
LogLineAppend("done.");
LogLineEnd();

LogLineStart("Loading EMO and LR quotes...");
my $emolr_quotes = LoadAllEMOandLRquotes($streamnbbo_dir, $sectors);
LogLineAppend("done.");
LogLineEnd();

# Params: nu, mu, eta3, eta4, tau, betas (intercept, midpoint test,
# EMO-ish test w/tau bandwidth, tick test), phis (midpoint, EMO-ish,
# tick), and random effect parameters sigma^2_c and sigma^2_d
#
# Index layout as consumed below: 0..11 are the delay parameters
# (four per market), 12 is tau, and 13..30 are six fixed-effect
# coefficients per market (NYSE, AMEX, Nasdaq).
my @paramarray = (0.618,0.784,0,0,         # NYSE delay params
                  1.655,0.352,0,0,         # AMEX delay params
                  1.651,0.330,0,0,         # Nasdaq delay params
                  0.000209,                # bid/ask metric param
                  2.0377,121.93,-20.5081,  # NYSE EMO,LR,tick
                  -0.1692,0,0,             # NYSE prev EMO,LR,tick
                  1.2071,0,0,              # AMEX EMO,LR,tick
                  0.3338,0,0,              # AMEX prev EMO,LR,tick
                  1.4054,209.41,29.4486,   # Nasdaq EMO,LR,tick
                  -0.1439,0,0              # Nasdaq prev EMO,LR,tick
                 );

LogLineStart("Estimating prevailing bids and asks...");
# pass in nu, mu, eta3, and eta4 for all three markets
my $est_quotes = EstimateQuotes($all_trades, $expected_quotes_dir,
                                $streamnbbo_dir, @paramarray[0..11]);
LogLineAppend("done.");
LogLineEnd();

LogLineStart("Creating dataset...");
# pass in single tau for all markets
my $glmm_dataset = CreateOutOfSampleDataset($glmm_data_dir, $all_trades,
                                            $est_quotes, $emolr_quotes,
                                            $sectors, @paramarray);
LogLineAppend("done.");
LogLineEnd();

close STDOUT;

Log("Stopping $0");
exit();


# Returns the current local time as "YYYYMMDD HH:MM:SS"; shared by
# Log and LogLineStart so both stamp their messages identically.
sub _LogTimestamp {
    my($ss, $mm, $hh, $dd, $mo, $yr) = localtime();
    return sprintf("%4d%02d%02d %02d:%02d:%02d",
                   $yr+1900, $mo+1, $dd, $hh, $mm, $ss);
}

# Writes one complete, timestamped line to STDERR.
sub Log {
    my($message) = @_;
    my $timestamp = _LogTimestamp();
    print(STDERR "$timestamp: $message\n");
}

# Starts a timestamped STDERR line without terminating it; continue it
# with LogLineAppend and finish it with LogLineEnd.
sub LogLineStart {
    my($message) = @_;
    my $timestamp = _LogTimestamp();
    print(STDERR "$timestamp: $message");
}

# Appends text to the STDERR line opened by LogLineStart.
sub LogLineAppend {
    my($message) = @_;
    print(STDERR "$message");
}

# Terminates the STDERR line opened by LogLineStart.
sub LogLineEnd {
    print(STDERR "\n");
}

# Loads the trades of the out-of-sample days.
#   $trades_dir - directory holding ArcaTrade_YYYYMMDD.csv files
#   $universe   - hash ref keyed by the symbols to keep
# Returns a hash ref: date (YYYYMMDD) => whatever GetTrades returns
# for that day's file.
sub LoadAllTrades {
    my($trades_dir, $universe) = @_;

    # Out-of-sample window, counted over trade files sorted by name:
    # the first $days_to_skip files are skipped and loading stops once
    # $last_day files have been seen, i.e. trading days 3--13 are loaded.
    # NOTE(review): the comment that used to sit here said "trading
    # days 3--8"; the limits below are the ones the code really used.
    my $days_to_skip = 2;
    my $last_day     = 13;

    opendir(my $dh, $trades_dir)
        or die "Cannot open trades directory '$trades_dir'!";
    my @trade_files = sort grep { /^ArcaTrade_\d{8}\.csv$/ } readdir($dh);
    closedir($dh);

    my $all_trades = {};
    my $num_days = 0;
    foreach my $entry (@trade_files) {
        last if $num_days >= $last_day;
        if ($num_days >= $days_to_skip) {
            my($date) = ($entry =~ /^ArcaTrade_(\d{8})/);
            $all_trades->{$date} =
                GetTrades("$trades_dir/$entry", $universe, $date);
        }
        $num_days++;
    }

    return $all_trades;
}

# This gets all trades in our universe from a file
# CRUCIAL ASSUMPTION: All trades in the file are from
# one trade date, not multiple dates!
# Reads one day's trade file and returns a hash ref
#   symbol => [ trade, trade, ... ]
# in file order, where each trade is an array ref indexed by the
# TRADE_* constants (time in seconds since midnight, symbol, volume,
# price, liquidity-provider side).
#   $trade_file - path of the CSV file; its first line is a header
#   $universe   - hash ref; only symbols that are keys of it are kept
#   $id4logging - label (the date) used in the progress log
sub GetTrades {
    my($trade_file, $universe, $id4logging) = @_;
    my $trade_list = {};

    open(my $fh, '<', $trade_file) or die "Cannot read in trades!";
    my $headerline = <$fh>;        # column names; read only to skip them
    my @tradelines = <$fh>;
    close $fh;
    chomp @tradelines;
    LogLineAppend("done.");
    LogLineEnd();

    LogLineStart("Creating trades data structure ($id4logging)...");
    my @resolution_queue = ();
    foreach my $tl (@tradelines) {
        my @tfields = split /,/, $tl;
        my($hh, $mm, $ss) = split /:/, $tfields[3];
        # time, symbol, volume, price, liq provider buy/sell
        my $t = [$hh*60*60 + $mm*60 + $ss, @tfields[6, 7, 8, 10]];
        next if !defined $t->[TRADE_SYMBOL()];      # blank or short line
        next if !exists $universe->{$t->[TRADE_SYMBOL()]};

        # Consecutive trades of one symbol within the same second are
        # queued so they can be given distinct fractional times.
        if (@resolution_queue > 0
            and $t->[TRADE_SYMBOL()] eq $resolution_queue[0]->[TRADE_SYMBOL()]
            and $t->[TRADE_TIME()] == $resolution_queue[0]->[TRADE_TIME()]) {
            push @resolution_queue, $t;
            next;
        }

        # now handle same-second trades
        _FlushSameSecondTrades(\@resolution_queue, $trade_list);
        push @resolution_queue, $t;
    }
    # clean out the queue
    _FlushSameSecondTrades(\@resolution_queue, $trade_list);

    return $trade_list;
}

# Empties the queue of same-symbol, same-second trades: trade i of n
# gets i/(n+1) seconds added to its time and is appended to its
# symbol's list in $trade_list.
sub _FlushSameSecondTrades {
    my($queue, $trade_list) = @_;
    my $ntrades = scalar @$queue;
    my $i = 1;
    while (my $sst = shift @$queue) {
        $sst->[TRADE_TIME()] += $i/($ntrades+1);
        push @{$trade_list->{$sst->[TRADE_SYMBOL()]}}, $sst;
        $i++;
    }
}

# Loads the EMO/LR quote vectors of the out-of-sample days.
#   $quotes_dir - directory holding quotevectors.YYYYMMDD.csv files
#   $universe   - hash ref keyed by the symbols to keep
# Returns a hash ref: date (YYYYMMDD) => result of GetEMOLRQuotes.
sub LoadAllEMOandLRquotes {
    my($quotes_dir, $universe) = @_;

    # Out-of-sample window, counted over quote files sorted by name:
    # the first $days_to_skip files are skipped and loading stops once
    # $last_day files have been seen, i.e. trading days 3--13 are loaded.
    # NOTE(review): the comment that used to sit here said "trading
    # days 3--8"; the limits below are the ones the code really used.
    my $days_to_skip = 2;
    my $last_day     = 13;

    opendir(my $dh, $quotes_dir)
        or die "Cannot open quotes directory '$quotes_dir'!";
    my @quote_files = sort grep { /^quotevectors\.\d{8}\.csv$/ } readdir($dh);
    closedir($dh);

    my $all_quotes = {};
    my $num_days = 0;
    foreach my $entry (@quote_files) {
        last if $num_days >= $last_day;
        if ($num_days >= $days_to_skip) {
            my($date) = ($entry =~ /^quotevectors\.(\d{8})/);
            $all_quotes->{$date} =
                GetEMOLRQuotes("$quotes_dir/$entry", $universe, $date);
        }
        $num_days++;
    }

    return $all_quotes;
}

# This gets all quotes in our universe from a file
# CRUCIAL ASSUMPTION: All quotes in the file are from
# one trade date, not multiple dates!
#
# Returns a hash ref  symbol => { time => [emobid, emoask, lrbid1,
# lrask1, lrbid2, lrask2] }  where time is the file's seconds field
# formatted with "%0.7f" (the same formatting the dataset builder
# applies to trade times before looking quotes up).
sub GetEMOLRQuotes {
    my($quote_file, $universe, $id4logging) = @_;
    my $quote_list = {};

    open(my $fh, '<', $quote_file) or die "Cannot read in quotes!";
    my $headerline = <$fh>;        # column names; read only to skip them
    my @quotelines = <$fh>;
    close $fh;
    chomp @quotelines;
    LogLineAppend("done.");
    LogLineEnd();

    LogLineStart("Creating quotes data structure ($id4logging)...");
    foreach my $ql (@quotelines) {
        my @qfields = split /,/, $ql;
        my($symbol, $timesecs, $veclength) = @qfields[0, 3, 4];
        next if !defined $symbol;                   # blank line
        next if !exists $universe->{$symbol};
        $timesecs = sprintf("%0.7f", $timesecs);

        my $emobid = $qfields[5 + ($veclength+1)*2 - 1];
        my $emoask = $qfields[scalar(@qfields) - 1];

        # bid side: $veclength delays start at field 6; the value that
        # belongs to a delay sits $veclength+1 fields after it
        my $base = 6;
        my($lrbid1_idx, $lrbid2_idx) =
            _FindLRQuoteIndices(\@qfields, $base, $veclength);
        my $lrbid1 = $qfields[$lrbid1_idx+$veclength+1];
        my $lrbid2 = $qfields[$lrbid2_idx+$veclength+1];

        # ask side: its own vector length follows the bid block
        $base += 2*($veclength+1)-1;
        $veclength = $qfields[$base];
        $base += 2;
        my($lrask1_idx, $lrask2_idx) =
            _FindLRQuoteIndices(\@qfields, $base, $veclength);
        my $lrask1 = $qfields[$lrask1_idx+$veclength+1];
        my $lrask2 = $qfields[$lrask2_idx+$veclength+1];

        # since we made time in seconds unique per symbol and date
        # there should be no hash clashes
        $quote_list->{$symbol}{$timesecs} =
            [$emobid, $emoask, $lrbid1, $lrask1, $lrbid2, $lrask2];
    }

    return $quote_list;
}

# Scans the $veclength delay fields starting at index $base and returns
# two field indices: the last one whose delay is >= 5 and the last one
# whose delay is >= 6 (each defaults to $base when none qualifies).
# Original notes: the first is the "last active quote" and the second
# the "first active quote" in the "five-seconds ago" window.
# NOTE(review): that reading presumes delays are ordered within the
# vector -- confirm against the quotevectors file format.
sub _FindLRQuoteIndices {
    my($qfields, $base, $veclength) = @_;
    my($idx_ge5, $idx_ge6) = ($base, $base);
    for (my $i = 0; $i < $veclength; $i++) {
        $idx_ge5 = $i+$base if $qfields->[$i+$base] >= 5;
        $idx_ge6 = $i+$base if $qfields->[$i+$base] >= 6;
    }
    return ($idx_ge5, $idx_ge6);
}

# Reads the universe CSV (header line, then symbol,sector,industry)
# and returns a hash ref  symbol => sector.
sub ReadInUniverseAndSectors {
    my($universe_filename) = @_;
    my $universe_map = {};

    open(my $fh, '<', $universe_filename)
        or die "Cannot read in universe (from '$universe_filename')!";
    my $headerline = <$fh>;        # column names; read only to skip them
    my @universe_lines = <$fh>;
    close $fh;
    chomp @universe_lines;

    foreach my $ul (@universe_lines) {
        my($symbol, $sector, $industry) = split /,/, $ul;
        # a blank line must not put an empty symbol into the universe
        next if !defined $symbol or $symbol eq "";
        $universe_map->{$symbol} = $sector;
    }

    return $universe_map;
}

# Optionally (re)computes, then loads, the estimated prevailing quotes.
#   $all_trades           - hash ref keyed by date; only its keys are used
#   $estimated_quotes_dir - where estbidask.oos.YYYYMMDD.csv files live
#   $streamnbbo_dir       - where quotevectors.YYYYMMDD.csv files live
#   @delay_parameters     - nu, mu, eta3, eta4 for NYSE, AMEX, Nasdaq
# Returns a hash ref  date => symbol => time => [estbid, estask], with
# "nan" estimates turned into undef.
sub EstimateQuotes {
    my($all_trades, $estimated_quotes_dir,
       $streamnbbo_dir, @delay_parameters) = @_;
    my($nu_nyse, $mu_nyse, $eta3_nyse, $eta4_nyse,
       $nu_amex, $mu_amex, $eta3_amex, $eta4_amex,
       $nu_nasdaq, $mu_nasdaq, $eta3_nasdaq, $eta4_nasdaq) = @delay_parameters;

    # Recomputation is switched off: the estimate files already present
    # in $estimated_quotes_dir are simply read back in.
    my $estimate_quotes = 0;
    if ($estimate_quotes) {
        opendir(my $stream_dh, $streamnbbo_dir)
            or die "Cannot open directory of streamlined quote files!";
        my @streamnbbofiles = readdir($stream_dh);
        closedir($stream_dh);

        # use fork() to distribute (across multiple CPUs on one machine)
        # the computation of estimates of the prevailing bids and asks.
        # technically, the call to system() invokes a second fork; but,
        # I need to do some setup and teardown for the system call.
        my @problemdates = ();
        foreach my $streamfile (sort @streamnbbofiles) {
            next if $streamfile !~ /^quotevectors\.(\d{8})\.csv$/;
            my $date = $1;
            if (!exists $all_trades->{$date}) {
                LogLineAppend(" x${date}x");
                next;
            }
            LogLineAppend(" $date");

            my $slot = WaitForProcessingSlot();
            ReserveProcessingSlot($slot);
            my $process_id = fork();
            if (!defined $process_id) {
                # fork failed and we are still the parent; testing
                # "$process_id == 0" here would send the parent down
                # the child branch and make it exit.
                push @problemdates, [$date, "fork failed: $!"];
                FreeProcessingSlot($slot);
                next;
            }
            if ($process_id == 0) { # this is the spawned child
                my $calculate_command =
                    sprintf("/home2/dale/calc_exp_bidask %f %f %f %f %f %f %f %f".
                            " %f %f %f %f %s 2> %s > %s",
                            $nu_nyse, $mu_nyse, $eta3_nyse, $eta4_nyse,
                            $nu_amex, $mu_amex, $eta3_amex, $eta4_amex,
                            $nu_nasdaq, $mu_nasdaq, $eta3_nasdaq, $eta4_nasdaq,
                            "$streamnbbo_dir/$streamfile",
                            "$estimated_quotes_dir/log.oos.$date.log",
                            "$estimated_quotes_dir/estbidask.oos.$date.csv");
                # The exit status is not reported back to the parent.
                system($calculate_command);
                FreeProcessingSlot($slot);
                exit 0;
            }
        }
        WaitForAllProcessingSlots();
        LogLineAppend("done.");
        LogLineEnd();

        if (@problemdates > 0) {
            LogLineStart("Problems:");
            foreach my $prob (@problemdates) {
                # give date and what went wrong
                LogLineAppend(" $prob->[0] ($prob->[1])");
            }
            LogLineEnd();
        }
    }

    # Read in the output files and store the estimated quotes
    opendir(my $est_dh, $estimated_quotes_dir)
        or die "Cannot open directory of estimated quote files!";
    my @estbidaskfiles = readdir($est_dh);
    closedir($est_dh);

    my $estimated_bidasks = {};
    foreach my $eba_file (@estbidaskfiles) {
        next if $eba_file !~ /^estbidask\.oos\.\d{8}\.csv$/;
        open(my $fh, '<', "$estimated_quotes_dir/$eba_file")
            or die "Could not open estimated bid ask file '$eba_file'!";
        my @eba_lines = <$fh>;
        close $fh;
        chomp @eba_lines;

        foreach my $line (@eba_lines) {
            my($sym, $dt, $tsec, $estbid, $estask) = split /,/, $line;
            next if !defined $tsec;     # blank or truncated line
            $estbid = undef if defined $estbid and $estbid eq "nan";
            $estask = undef if defined $estask and $estask eq "nan";

            # since we made time in seconds unique per symbol and date
            # there should be no hash clashes
            $estimated_bidasks->{$dt}{$sym}{$tsec} = [$estbid, $estask];
        }
    }

    return $estimated_bidasks;
}

# create out-of-sample dataset by merging trades with estimates
# of prevailing quotes and EMO-ish test values.
# EMO test yields +1 for trades at prevailing ask
# and -1 for trades at prevailing bid.
# EMO-ish test gives scaled gaussian to catch trades near
# the estimated prevailing bid and ask. Actual value of
# EMO-ish test is:
# exp(-[(trdprc - estask)/tau]^2) - exp(-[(trdprc - estbid)/tau]^2)
# (evaluated below on log prices).
#
#   $dir          - directory that receives glmmdataset.oos.csv
#   $trades       - date => symbol => [trades]   (see GetTrades)
#   $est_quotes   - date => symbol => time => [estbid, estask]
#   $emolr_quotes - date => symbol => time => [emobid, emoask,
#                   lrbid1, lrask1, lrbid2, lrask2]
#   $sectors      - symbol => sector name
#   @params       - full parameter vector; index 12 is tau and
#                   13..30 hold six coefficients per market
# Writes one CSV row per usable trade with the columns
#   symbol, date, market, sector, time, date:bin, volume, price, bs,
#   bs_pred, pi, tick, lr1, lr2, emo, spread, resorted_to_tick
# and returns the path of the file written.
sub CreateOutOfSampleDataset {
    my($dir, $trades, $est_quotes, $emolr_quotes, $sectors, @params) = @_;

    # The buy_sell field is for the *liquidity providing*
    # (LP) trade, not for the "initiating" trade.
    # ('Initiator' as used by Odders-White.)
    my %lpside_map = (B => 0,      # LP buy => Init. sell
                      C => undef,  # cross trade
                      X => 1,      # LP sell short => Init. buy
                      S => 1);     # LP sell long => Init. buy
    my %sectormnem_map = ("Basic Materials" => "Mat",
                          "Capital Goods" => "CapGd",
                          "Conglomerates" => "Congl",
                          "Consumer Cyclical" => "Cycl",
                          "Consumer Non-Cyclical" => "NCyc",
                          "Energy" => "Enrg",
                          "Financial" => "Finc",
                          "Healthcare" => "Hlth",
                          "Industrial Goods" => "IndGd",
                          "Services" => "Serv",
                          "Technology" => "Tech",
                          "Transportation" => "Trans",
                          "Utilities" => "Util",
                          "" => "NA");
    my $missing = ".";

    my $oos_datafile = "$dir/glmmdataset.oos.csv";
    open(my $datafile, '>', $oos_datafile)
        or die "Cannot open file for out-of-sample dataset!";

    my $tau = $params[12];
    foreach my $dt (sort {$a <=> $b} keys %$trades) {
        foreach my $sym (sort keys %{$trades->{$dt}}) {
            my $market = PrimaryExchange($sym);
            my $sector = $sectormnem_map{$sectors->{$sym}};

            # fixed effects: six coefficients per market, NYSE first
            my $offset = 13;
            $offset += 6 if $market eq "A";
            $offset += 12 if $market eq "T";

            # crucial assumption for the trades loop: trades
            # are time-ordered with earlier trades first
            my($prev_t, $prev_nztick); # previous trade, non-zero tick
            # previous classification metric (nee tests)
            my($prev_midpt_metric, $prev_tick_metric,
               $prev_emoish_metric) = (0, 0, 0);
          TRADE:
            foreach my $t (@{$trades->{$dt}{$sym}}) {
                my $price = $t->[TRADE_PRICE()];

                # the reference trade for the tick test only moves
                # when the price differs from the previous trade's
                $prev_nztick = $prev_t
                    if defined $prev_t and $price != $prev_t->[TRADE_PRICE()];
                my $tick_metric =
                    (defined $prev_nztick and $prev_nztick->[TRADE_PRICE()] > 0)
                    ? log($price) - log($prev_nztick->[TRADE_PRICE()])
                    : 0;

                my $tsec = sprintf("%0.7f", $t->[TRADE_TIME()]);
                if (!exists $emolr_quotes->{$dt}
                    or !exists $emolr_quotes->{$dt}{$sym}
                    or !exists $emolr_quotes->{$dt}{$sym}{$tsec}) {
                    # big trouble in little china; squawk and skip
                    Log("Failed to match trade ($sym, $dt, $tsec)".
                        " and EMO/LR quotes!");
                    next TRADE;
                }
                if (!exists $est_quotes->{$dt}
                    or !exists $est_quotes->{$dt}{$sym}
                    or !exists $est_quotes->{$dt}{$sym}{$tsec}) {
                    # big trouble in little china; squawk and skip
                    Log("Failed to match trade ($sym, $dt, $tsec)".
                        " and estimated quotes!");
                    $prev_midpt_metric = 0;
                    $prev_tick_metric = 0;
                    $prev_emoish_metric = 0;
                    $prev_t = $t;
                    next TRADE;
                }

                # allocate to a ten-minute bin (bin 0 starts at 9:30);
                # only bins 4..35 are kept
                my $bin = int(($tsec - 9*60*60 - 30*60)/(10*60));
                next TRADE if $bin < 4 or $bin > 35;
                my $dtbin = $dt.":".$bin;

                my($emo_bid, $emo_ask, $lr1_bid, $lr1_ask,
                   $lr2_bid, $lr2_ask) = @{$emolr_quotes->{$dt}{$sym}{$tsec}};
                my($estbid, $estask) = @{$est_quotes->{$dt}{$sym}{$tsec}};
                my $estmid = (defined $estbid and defined $estask)
                    ? ($estbid + $estask)/2 : 0;

                my $midpt_metric = ($estmid > 0)
                    ? log($price) - log($estmid) : 0;
                my $emoask_metric = (!defined $estask or $estask <= 0)
                    ? 0 : exp(-((log($price) - log($estask))/$tau)**2);
                my $emobid_metric = (!defined $estbid or $estbid <= 0)
                    ? 0 : exp(-((log($price) - log($estbid))/$tau)**2);
                my $emoish_metric = ($emoask_metric - $emobid_metric);

                my @tests = ($emoish_metric, $midpt_metric, $tick_metric,
                             $prev_emoish_metric, $prev_midpt_metric,
                             $prev_tick_metric);

                # True side; stays undef (an empty field) for crosses
                # and unknown codes.  Do not fold this back into
                # "my $bs = ... if ...": that form has no defined
                # behaviour when the condition is false.
                my $lpside = $t->[TRADE_LPSIDE()];
                my $bs = (defined $lpside and exists $lpside_map{$lpside})
                    ? $lpside_map{$lpside} : undef;

                my $log_odds = 0;
                for (my $i = 0; $i < 6; $i++) {
                    $log_odds += $params[$i+$offset]*$tests[$i];
                }
                my $pi = exp($log_odds)/(1 + exp($log_odds));
                my $bs_pred = ($pi >= 0.5) ? 1 : 0;

                # Tick test
                my $tick = ($tick_metric > 0) ? 1 : 0;
                # LR test on the first LR quote pair; a trade at the
                # midpoint falls back to the 0/1 tick test, exactly as
                # for the second pair below (not to the raw tick metric)
                my $lr1_mid = ($lr1_bid + $lr1_ask)/2;
                my $lr1 = ($price > $lr1_mid) ? 1
                        : ($price < $lr1_mid) ? 0
                        : $tick;
                # LR test on the second LR quote pair
                my $lr2_mid = ($lr2_bid + $lr2_ask)/2;
                my $lr2 = ($price > $lr2_mid) ? 1
                        : ($price < $lr2_mid) ? 0
                        : $tick;
                # EMO test
                my $emo = ($price == $emo_ask) ? 1
                        : ($price == $emo_bid) ? 0
                        : undef;
                my $resorted_to_tick = 0;
                if (!defined $emo) {
                    $emo = $tick;
                    $resorted_to_tick = 1;
                }

                # Average contemporaneous spread; reported as missing
                # when either estimate is unavailable ("nan") or the
                # bid is not positive, instead of dividing by zero
                my $spread =
                    (defined $estbid and defined $estask and $estbid > 0)
                    ? $estask/$estbid - 1 : $missing;

                my $outline = join(",",
                                   $sym || $missing,
                                   $dt || $missing,
                                   $market || $missing,
                                   $sector || $missing,
                                   $tsec || $missing,
                                   $dtbin || $missing,
                                   $t->[TRADE_VOLUME()] || $missing,
                                   $price || $missing,
                                   (defined $bs ? $bs : ""),
                                   $bs_pred, $pi, $tick, $lr1, $lr2,
                                   $emo, $spread, $resorted_to_tick);
                print {$datafile} "$outline\n";

                $prev_midpt_metric = $midpt_metric;
                $prev_tick_metric = $tick_metric;
                $prev_emoish_metric = $emoish_metric;
                $prev_t = $t;
            }
        }
    }
    close($datafile)
        or die "Cannot finish writing out-of-sample dataset '$oos_datafile': $!";

    return $oos_datafile;
}

# convenience/abstraction functions
#

# Number of concurrent estimation processes: one estimation process
# per CPU (could do more), kept at 2 so as not to step on other
# processes.
sub NUM_PROCESSING_SLOTS { 2 }

# Blocks until some slot is free and returns its number (1-based).
sub WaitForProcessingSlot {
    # I hate that this might never terminate -- even though the
    # style is classic old-school C. Could add a $max_iter variable
    # but... meh, who cares? Then I'd have to error check the slot
    # code I return and handle that when I'd probably just kill the
    # script... which is what I would do for a runaway loop as well.
    # So the answer is: moot.
    sleep(1);
    while (1) {
        foreach my $i (1..NUM_PROCESSING_SLOTS()) {
            return $i if ProcessingSlotAvailable($i);
        }
        sleep(10); # wait 10 seconds between checks
    }
}

# Blocks until every slot is free, i.e. all children have finished.
sub WaitForAllProcessingSlots {
    while (1) {
        my $busy = grep { !ProcessingSlotAvailable($_) }
                        1..NUM_PROCESSING_SLOTS();
        last if !$busy;
        sleep(10); # wait 10 seconds between checks
    }
}

# A slot is free when its marker file ".slot.N" does not exist in the
# current directory.
sub ProcessingSlotAvailable {
    my($slotnumber) = @_;
    return !-e ".slot.$slotnumber";
}

# Creates the slot's marker file (without shelling out to touch).
sub ReserveProcessingSlot {
    my($slotnumber) = @_;
    my $marker = ".slot.$slotnumber";
    if (open(my $fh, '>>', $marker)) {
        close $fh;
    }
    else {
        warn "Cannot reserve processing slot $slotnumber: $!";
    }
}

# Removes the slot's marker file.
sub FreeProcessingSlot {
    my($slotnumber) = @_;
    unlink(".slot.$slotnumber");
}

# Formats seconds since midnight as H:MM:SS.ssss with zero-padded
# minutes and seconds ("%07.4f" pads the seconds; the former "%02.4f"
# never did because its width is smaller than the field).
sub Seconds2Time {
    my($seconds) = @_;
    my $hh = int($seconds / (60*60));
    $seconds -= $hh*60*60;
    my $mm = int($seconds / 60);
    $seconds -= $mm*60;
    return sprintf("%d:%02d:%07.4f", $hh, $mm, $seconds);
}

# Returns the primary exchange code for a symbol: "A" (AMEX), "N"
# (NYSE) or "T" (Nasdaq).  Listed AMEX names win, then the listed NYSE
# multi-class names, then any symbol of four or more characters counts
# as Nasdaq; whatever is left is NYSE.  Symbols are compared as exact
# strings, so characters such as "." are never treated as regex syntax.
sub PrimaryExchange {
    my($symbol) = @_;
    my %is_amex = map { $_ => 1 }
        qw(AFP AVD AVN AX BCP BHL BIO BL BMI CAC CAS
           CHC COI CPD CTO CUB DAR DFC DHB DHC END FIZ
           GRC GSX GTE GW HH HT HTC IMA IVX KFX LB LGN
           LNG MIX MLP MSS MWP NBY NHC NHR NVR OHB OMR
           PDC PGC PRK PRZ PSB RIV SEB SJW STB TBV TDS
           TIV TKO TMP TPY TWW USM WFD WSC);
    my %is_nyse_multiclass = map { $_ => 1 }
        qw(BF.A BF.B CRD.A CRD.B FCE.A FCE.B FSL FSL.B
           JW.A JW.B KV.A KV.B MOG.A MOG.B NWS NWSA
           VIA VIA.B SQA.A SQA.B TRX TRX.B TRY TRY.B);

    $symbol = "" if !defined $symbol;
    return "A" if $is_amex{$symbol};
    return "N" if $is_nyse_multiclass{$symbol};
    return "T" if length($symbol) >= 4;
    # only non-AMEX 1, 2, and 3-letter symbols left
    return "N";
}

# Positions of the fields inside a trade record built by GetTrades.
sub TRADE_DATE { undef } # field 2 in file
sub TRADE_TIME { 0 } # field 3 in file
sub TRADE_SYMBOL { 1 } # field 6 in file
sub TRADE_VOLUME { 2 } # field 7 in file
sub TRADE_PRICE { 3 } # field 8 in file
sub TRADE_LPSIDE { 4 } # field 10 in file