/*
 * File:     char_freq2.c
 * Contents: Source code for the char_freq2 command.
 * Author:   Jeffrey S. Leon
 * Updated:  1/30/05
 *
 * The char_freq2 command counts the number of occurrences of each
 * two-letter sequence, or digram, in a text file.  Upper and lower case
 * letters are treated as equivalent, and characters other than letters are
 * ignored (so letters separated only by non-letters still form a digram).
 * The frequencies are printed first as a 26 x 26 table and then as a list
 * of the most common digrams in decreasing order of frequency.
 *
 * The syntax for invoking the command is
 *
 *     char_freq2 text_file [adjust_to]
 *
 * where
 *
 *     text_file  is the name of the text file.
 *     adjust_to  (optional) is an integer 2 or greater that indicates that
 *                the frequency of each digram is to be scaled so that the
 *                frequencies sum to n-1, where n is the value of adjust_to.
 *                In particular, if adjust_to == 2, the probability of each
 *                digram is shown.  If adjust_to is omitted, no scaling is
 *                performed.
 *
 * Note: n-1 is used rather than n, because a text with n letters has n-1
 * digrams.
 *
 * Exit status: 0 on success, 1 for a wrong argument count, 2 for an
 * unreadable file or invalid adjust_to, 3 if the text has fewer than two
 * letters.
 */

#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

enum {
   ALPHABET_SIZE = 26,   /* Letters a..z. */
   TABLE_HALF    = 13,   /* Columns shown per half of the digram table. */
   TOP_DIGRAMS   = 100   /* Length of the "most common digrams" list. */
};

/* One entry per digram.  letter1 and letter2 are 0-based letter indices;
   they are stored in the entry because sorting destroys the relationship
   between an entry's position in the array and the digram it describes. */
struct digram {
   int freq;
   int letter1;
   int letter2;
} count[ALPHABET_SIZE][ALPHABET_SIZE];


/* Comparison function for qsort: orders digram entries by decreasing
   frequency.  Ties are broken alphabetically so that the output does not
   depend on the (unspecified) stability of the library's qsort. */
int compar( const void *f1, const void *f2)
{
   const struct digram *d1 = f1;
   const struct digram *d2 = f2;

   if ( d1->freq != d2->freq )
      return (d1->freq > d2->freq) ? -1 : 1;
   if ( d1->letter1 != d2->letter1 )
      return (d1->letter1 < d2->letter1) ? -1 : 1;
   return (d1->letter2 > d2->letter2) - (d1->letter2 < d2->letter2);
}


/* Parse the adjust_to argument.  Returns its value, or 0 if the argument
   is not a complete integer in the range 2..INT_MAX.  Base 0 is used, so
   the usual 0x / 0 prefixes for hexadecimal / octal are accepted. */
static int parse_adjust_to( const char *arg)
{
   char *end;
   long value;

   errno = 0;
   value = strtol( arg, &end, 0);
   if ( end == arg || *end != '\0' || errno == ERANGE
                   || value < 2 || value > INT_MAX )
      return 0;
   return (int) value;
}


/* Read all of input and tally every digram into count[][].  Returns the
   number of letters read.  *base is set to 'a' if any lower case letter
   was seen and is left unchanged otherwise; the caller uses it to decide
   in which case to display the letters. */
static int count_digrams( FILE *input, char *base)
{
   int row, col, prev, ch, total;

   for ( row = 0 ; row < ALPHABET_SIZE ; ++row )
      for ( col = 0 ; col < ALPHABET_SIZE ; ++col ) {
         count[row][col].letter1 = row;
         count[row][col].letter2 = col;
         count[row][col].freq = 0;
      }

   /* Skip ahead to the first letter; it starts the first digram. */
   while ( (prev = getc(input)) != EOF && !isalpha(prev) )
      ;
   if ( prev == EOF )
      return 0;

   /* The first letter counts toward the total whatever its case. */
   total = 1;
   if ( islower(prev) )
      *base = 'a';

   while ( (ch = getc(input)) != EOF ) {
      if ( !isalpha(ch) )
         continue;
      if ( islower(ch) )
         *base = 'a';
      ++count[tolower(prev)-'a'][tolower(ch)-'a'].freq;
      ++total;
      prev = ch;
   }
   return total;
}


/* Scale a raw digram count from a text of total letters (total-1 digrams)
   to a notional text of adjust_to letters (adjust_to-1 digrams).
   Requires total >= 2. */
static double scaled( int freq, int total, int adjust_to)
{
   return freq / ((double)total - 1) * ((double)adjust_to - 1);
}


/* Print the 26 x 26 digram table in two halves of 13 columns each.
   Row x, column y holds the (scaled) frequency of digram xy.  Must be
   called before count[][] is sorted. */
static void print_table( int total, int adjust_to, int precision, char base)
{
   int first, row, col;

   for ( first = 0 ; first < ALPHABET_SIZE ; first += TABLE_HALF ) {
      /* Three blanks cover the row-label gutter; each heading is
         right-aligned in the same 8 columns as the numbers below it. */
      printf( "   ");
      for ( col = first ; col < first+TABLE_HALF ; ++col )
         printf( "%8c", base+col);
      printf( "\n\n");
      for ( row = 0 ; row < ALPHABET_SIZE ; ++row ) {
         printf( " %c ", base+row);
         for ( col = first ; col < first+TABLE_HALF ; ++col )
            printf( " %7.*f", precision,
                    scaled( count[row][col].freq, total, adjust_to));
         printf( "\n");
      }
      printf( "\n");
   }
}


/* Print the TOP_DIGRAMS most common digrams.  count[][] must already be
   sorted in order of decreasing frequency. */
static void print_most_common( int total, int adjust_to, int precision,
                               char base)
{
   const struct digram *entry = &count[0][0];
   int n;

   printf( "\nThe 100 most common digrams are shown below.\n\n");
   printf( " Digram Frequency\n" );
   if ( adjust_to != total )
      printf( " per %d chars\n", adjust_to);

   for ( n = 0 ; n < TOP_DIGRAMS && n < ALPHABET_SIZE*ALPHABET_SIZE ; ++n ) {
      printf( " %c%c ", entry[n].letter1+base, entry[n].letter2+base);
      printf( " %7.*f", precision,
              scaled( entry[n].freq, total, adjust_to));
      printf( "\n");
   }
}


int main( int argc, char *argv[])
{
   int total, precision, adjust_to = 0;
   FILE *input;
   char base = 'A';

   /* Check number of command-line arguments, and process arguments. */
   if ( argc != 2 && argc != 3 ) {
      printf( "Error: Wrong number of command-line arguments\n");
      exit(1);
   }
   input = fopen( argv[1], "r");
   if ( input == NULL ) {
      printf( "Error: File %s could not be opened.\n", argv[1]);
      exit(2);
   }
   if ( argc == 3 ) {
      adjust_to = parse_adjust_to( argv[2]);
      if ( adjust_to == 0 ) {
         printf( "Error: Invalid second argument %s.\n", argv[2]);
         fclose( input);
         exit(2);
      }
   }

   /* Count number of occurrences of each digram (sequence of two
      letters).  The file is not needed after this point. */
   total = count_digrams( input, &base);
   fclose( input);
   if ( total == 0 ) {
      printf( "Error: No letters in text\n");
      exit(3);
   }
   if ( total == 1 ) {
      printf( "Error: Only one letter in text\n");
      exit(3);
   }

   /* Display total number of letters in file. */
   printf( "\n%d alphabetic characters in sample.\n\n", total);

   /* Display frequencies of digrams, adjusted if necessary, and arranged
      as a matrix. */
   printf( "In the table below, row x, column y contains the frequency of digram xy.\n\n");
   if ( adjust_to != 0 )
      printf( "Frequencies are adjusted to %d letters (%d digrams).\n\n",
              adjust_to, adjust_to-1);
   if ( adjust_to == 0 )
      adjust_to = total;

   /* Unscaled counts are whole numbers; scaled values get more decimal
      places the smaller the scale, keeping about the same number of
      significant digits in a 7-character field. */
   if ( adjust_to == total )
      precision = 0;
   else if ( adjust_to < 10 )
      precision = 4;
   else if ( adjust_to < 100 )
      precision = 3;
   else if ( adjust_to < 1000 )
      precision = 2;
   else if ( adjust_to < 10000 )
      precision = 1;
   else
      precision = 0;

   print_table( total, adjust_to, precision, base);

   /* Sort digrams in order of decreasing frequency. */
   qsort( count, (size_t)ALPHABET_SIZE*ALPHABET_SIZE, sizeof count[0][0],
          compar);

   /* Display the 100 most common digrams, in order of decreasing
      frequency. */
   print_most_common( total, adjust_to, precision, base);

   /* All done. */
   return 0;
}