#!/usr/bin/perl

#
# A script to calculate bitsquats from a list of FQDNs
# Input: A list of FQDNs
# Output: A list of 1-bit variants 
# Author: Jaeson Schultz, Threat Research Engineer
# Date: July 23, 2013
#
# The MIT License (MIT)
#
# Copyright (c) 2013 Cisco Systems, Inc.
# 
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# 
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#

use strict;
use warnings;
use Getopt::Std;
use Tie::IxHash;

# Declare local variables
my ( %options,
     %valid_tlds,
     %need_third_level,
     %valid_second_level,
     $FQDN,
     @o_squats,
     @n_squats,
     @c_squats,
     @bit_squats,
     %found,
     %variants,
     $name
);

sub trim_FQDN($) {
    # Declare vars
    my ( @hostnames,
         $domain,
         $dots,
         $x
    );

    # Subroutine to shorten a Fully Qualified Domain Name down to its basic form,
    # which means eliminating any extraneous hostnames. Will return the registrable
    # form of a domain, or an empty string if none is found.

    $domain = shift @_;
    chomp($domain);

    # For bitsquats we just need to have the base domain
    # This is either domain.TLD or host.domain.TLD.
    # So first grab the TLD and see if its valid
    $dots = 1;
    @hostnames = split /\./, $domain;
    return('') unless ( $#hostnames > 0 );
    @hostnames = reverse @hostnames;
    return('')  unless ( $valid_tlds{$hostnames[0]} );

    # Depending on the second level domain, we may up the 
    # number of hostnames to include in our return value
    if ( $#hostnames > 1 ) {
        $dots += 1 if ( ( scalar grep /^$hostnames[1]$/, @{$valid_second_level{$hostnames[0]}} ) || $need_third_level{$hostnames[0]} );
    }

    # Now rebuild domain
    $domain = $hostnames[0];
    for ($x=1; $x <= $dots; $x++) {
        $domain = $hostnames[$x] . "." . $domain;
    }
    return($domain);
}

sub extract_o_squats($) {
    # Declare vars
    my ( $domain,
         @found,
         $result,
         $offset,
         $test_string
    );

    # Subroutine to analyze a domain name for the presence of bitsquat domains caused by
    # the letter 'o' flipping one bit to become a slash '/'.  Because these can happen
    # anywhere, including at the subdomain level, we check the entire FQDN starting from 
    # the leftmost name. This sub also looks for bare domains to either add or subtract 
    # a letter 'o' from, for example http://oreilly.com/ can flip one bit and become
    # http:///reilly.com.  While this is invalid syntax, it would most likely be fixed
    # by the browser, and direct traffic to 'reilly.com'.

    $domain = shift @_;
    chomp($domain);
    $offset = 0;

    # Loop through the string starting from the left looking for the letter 'o' inside the domain
    $result = index($domain, 'o', $offset);
    while ( $result != -1 ) {
        # We have a letter o present, lets form a substring up to this point
        $test_string = substr($domain,0,$result);
        $offset = $result+1;
        $result = index($domain, 'o', $offset);
        # Check for validity
        next unless &is_valid_domain($test_string);
        push(@found,$test_string);
    }

    # Look for the case of a naked domain name where we can add the letter 'o'
    if ( $domain =~ /^[a-z0-9][a-z0-9\-]*[a-z0-9]\.[a-z0-9\-]+$/ || $domain =~ /^[a-z0-9][a-z0-9\-]*[a-z0-9]\.[a-z]+\.[a-z][a-z]$/ ) {
        # It fits the general format lets see if it is valid
        $test_string = 'o' . $domain;
        $test_string = &trim_FQDN($test_string);
        push(@found,$test_string) if ( &is_valid_domain($test_string) && $test_string =~ /^o/ );
    }

    # Look for the case of a named domain name that begins with 'o' in case it loses it
    if ( $domain =~ /^o([a-z0-9\-]*[a-z0-9]\.[a-z0-9\-]+)$/ || $domain =~ /^o([a-z0-9\-]*[a-z0-9]\.[a-z0-9][a-z0-9\-]*[a-z0-9]\.[a-z][a-z])$/ ) {
        # It fits the format, lets see if its valid
        $test_string = $1;
        push(@found,$test_string) if ( &is_valid_domain($test_string) );
    }

    return(@found);
}

sub extract_n_squats($) {
    # Declare vars
    my ( $domain,
         @found,
         $result,
         $result_dot,
         $next_dot,
         $offset,
         $test_string
    );

    # Subroutine to process a domain name looking for bitsquats based on the letter 'n'
    # flipping a bit to become a dot '.' and vice-versa.  We  may conditionally include 
    # certain content depending on the form the n-based bitsquat takes.  For example the 
    # domain 'zinnco.in' can flip one bit to become 'zin.co.in' or 'zi.nco.in'.  The former 
    # is a registrable domain under 'co.in', but the latter is actually just the 2nd
    # level domain 'nco.in'.  

    $domain = shift @_;
    chomp($domain);
    $offset = length($domain);

    # First case, where the letter 'n' flips to be a dot
    # Starting from the right, find the letter 'n' in the string
    $result = rindex($domain, 'n', $offset);
    # We need to keep track of the second/third dot which will keep us from looking too far back
    $result_dot = rindex($domain, '.', $offset);
    $result_dot = rindex($domain, '.', ($result_dot-1));
    $result_dot = rindex($domain, '.', ($result_dot-1)) if ( $domain =~ /[a-z0-9][a-z0-9\-]*[a-z0-9]\.[a-z]+\.[a-z][a-z]$/ );
    while ( $result != -1 ) {
        # We have a letter N present, lets form a substring up to this point
      last if ($result <= $result_dot);
        $test_string = substr($domain,0,$result) . '.' . substr($domain,($result+1),length($domain));
        $offset = $result-1;
        $result = rindex($domain, 'n', $offset);
        $test_string = &trim_FQDN($test_string);
        next unless &is_valid_domain($test_string);
        push(@found,$test_string);
    }    

    # Second case, where the dot flips to be a letter 'n'
    $test_string = '';
    $offset = length($domain);
    $result_dot = rindex($domain, '.', $offset);
    $result_dot = rindex($domain, '.', ($result_dot-1));
    $next_dot = rindex($domain, '.', ($result_dot-1));
    if ( defined $next_dot && $next_dot < 0 ) {
        $next_dot = 0;
    } else {
        $next_dot += 1;
    }
    $test_string = substr($domain,$next_dot,($result_dot-$next_dot)) . 'n' . substr($domain,($result_dot+1),length($domain)) if ($result_dot > 0);
    push(@found,$test_string) if &is_valid_domain($test_string);
    # Sometimes we need to substitute 'n' for two dots in a name
    if ( $domain =~ /[a-z0-9][a-z0-9\-]*[a-z0-9]\.[a-z]+\.[a-z][a-z]$/ ) {
        $result_dot = rindex($domain, '.', ($result_dot-1));
        $next_dot = rindex($domain, '.', ($result_dot-1));
        if ( defined $next_dot && $next_dot < 0 ) {
            $next_dot = 0;
        } else {
            $next_dot += 1;
        }
        $test_string = substr($domain,$next_dot,($result_dot-$next_dot)) . 'n' . substr($domain,($result_dot+1),length($domain)) if ($result_dot > 0);
        push(@found,$test_string) if &is_valid_domain($test_string);
    }

    return(@found);
}

sub extract_c_squats($) {
    # Declare vars
    my ( $domain,
         @found,
         $result,
         $offset,
         $test_string
    );

    # Subroutine to process a domain and pull out any bitsquat domains created as 
    # a result of the letter 'c' flipping a bit to become a '#' char.  Because this 
    # can happen anywhere, including in the subdomains, we start processing from the
    # left, just as with o-based bitsquats.

    $domain = shift @_;
    chomp($domain);
    $offset = 0;

    # Loop through the string starting from the left looking for the letter 'c' inside the domain
    $result = index($domain, 'c', $offset);
    while ( $result != -1 ) {
        # We have a letter c present, lets form a substring up to this point
        $test_string = substr($domain,0,$result);
        $offset = $result+1;
        $result = index($domain, 'c', $offset);
        # Check for validity
        next unless &is_valid_domain($test_string);
        push(@found,$test_string);
    }

    return(@found)
}

sub extract_bitsquats($) {
    # Declare vars
    my ( $domain,
         @found,
         @domain,
         @newdomain,
         $offset,
         $TLD_dot,
         $SLD_dot,
         $THLD_dot,
         $test_TLD,
         $test_SLD,
         $test_string,
         $x,
         $char,
         $newchar
    );

    # Subroutine will take domain, remove any extra hostnames from the left side
    # then process the domain from the right, substituting in the possible bitsquat
    # characters one at a time.
    
    $domain = shift @_;
    chomp($domain);
    $test_string = '';

    # Throw away any uneccesary hostnames for this part
    $domain = &trim_FQDN($domain);
    @domain = split '', $domain;

    # Compute the location of the dots.  This is done because when a bit flips
    # in the Country Code Top Level Domain (ccTLD), then the third level domain 
    # may not be necessary to include in the result.  For example:
    # if 'example.ltd.uk' becomes 'example.ltd.tk', then the 'example' subdomain 
    # is extraneous because the 'ltd.tk' domain can be registered by itself.
    $offset = length($domain);
    $TLD_dot = rindex($domain, '.', $offset);
    $SLD_dot = rindex($domain, '.', ($TLD_dot-1));
    $THLD_dot = rindex($domain, '.', ($SLD_dot-1)) if ( $SLD_dot > 0 );
    if ( defined $SLD_dot && $SLD_dot > 0 ) {
        $SLD_dot += 1;
    } else {
        $SLD_dot = 0;
    }
    if ( defined $THLD_dot && $THLD_dot > 0 ) {
        $THLD_dot += 1;
    } else {
        $THLD_dot = 0;
    }
    for ($x=$#domain; $x >= 0; $x--) {
        $char = $domain[$x];
        foreach $newchar ( @{$variants{$char}} ) {
            # create a bitsquat of the target domain by replacing one char
            @newdomain = @domain;
            $newdomain[$x] = $newchar;
            $test_string = join '', @newdomain;
            # Check to see if we are still flipping chars in the TLD
            if ($x >= $SLD_dot) {
                ($test_SLD,$test_TLD) = ($test_string =~ /\.?([a-z0-9\-]+)\.([a-z][a-z])$/);
                if ( defined $test_SLD && defined $test_TLD && ( $need_third_level{$test_TLD} || scalar grep /^$test_SLD$/, @{$valid_second_level{$test_TLD}} ) ) { 
                    $test_string = substr($test_string,$THLD_dot,(($#newdomain+1)-$THLD_dot));
                } else {
                    $test_string = substr($test_string,$SLD_dot,(($#newdomain+1)-$SLD_dot));
                }
            }
            $test_string = &trim_FQDN($test_string);
            push(@found,$test_string) if &is_valid_domain($test_string);
        }   
    }
    return(@found);
}

sub is_valid_domain($) {
    # Declare vars
    my ( $domain,
         $valid,
         $not_valid,
         $test_TLD,
         $test_SLD,
         $test_THLD
    );

    # Subroutine will check the domain name it is passed and attempt to 
    # make a determination whether the domain is valid.  Tests things like multiple
    # dots in a row, inclusion of illegal chars in the domain like '#', '/', 
    # and other generic tests for validity.

    $domain = shift @_;
    chomp($domain);
    $valid = 1;
    $not_valid = 0;

    # Must contain a dot
    return($not_valid) unless ( $domain =~ /\./ );

    # Must not begin with a dot or dash
    return($not_valid) if ( $domain =~ /^(?:\.|\-)/ );

    # Must not end with a dot or dash
    return($not_valid) if ( $domain =~ /(?:\.|\-)$/ );
 
    # Must not contain multiple dots in a row
    return($not_valid) if ( $domain =~ /\.\./ );

    # Must not have dash then dot or dot then dash
    return($not_valid) if ( $domain =~ /(\-\.|\.\-)/ );

    # Cannot have any fwd slashes in the domain itself
    return($not_valid) if ( $domain =~ /\// );

    # Cannot have any '#' chars in the domain itself
    return($not_valid) if ( $domain =~ /#/ );

    # Attempt to find a valid TLD
    ($test_TLD) = ($domain =~ /\.([a-z0-9\-]+)$/);
    return($not_valid) unless ( defined $test_TLD && $valid_tlds{$test_TLD} );
    
    # Now we know at least the TLD portion is valid
    # Do checks for special TLDs like .uk, .br where only third level registrations are accepted
    ($test_SLD) = ($domain =~ /([^.]+)\.$test_TLD$/);
    if ( $need_third_level{$test_TLD} ) {
        # Check second level to be sure its part of the set
        return($not_valid) unless ( defined $test_SLD && scalar grep /^$test_SLD$/, @{$valid_second_level{$test_TLD}} );
    }

    # Test for missing 3rd level domain name
    ($test_THLD) = ($domain =~ /([^.]+)\.$test_SLD\.$test_TLD$/);
    if ( $need_third_level{$test_TLD} || scalar grep /^$test_SLD$/, @{$valid_second_level{$test_TLD}} ) {
        return($not_valid) unless ( defined $test_THLD );
    }

    # At this point we believe the domain is okay, so return valid response
    return($valid);
}



#
# Begin main section
#

# Get command options
getopts("Rh", \%options);
if ( @ARGV == 0 || $options{h} ) {
    print "Usage:\n\n";
    print "  bitsquat_rpz.pl [ -R ] [ FQDN-List ]\n\n";
    print "This script processes an input file containing a list of\n";
    print "Fully Qualified Domain Names (FQDNs).  The output of the\n";
    print "script is a list of 1-bit variants computed from the input.\n\n";
    print "The -R option will produce output in Response Policy Zone\n";
    print "(RPZ) format.\n"; 
    exit;
}

# Open input file
open FILE, "<", $ARGV[0] || die "Cannot open input FQDN file: $!";

# List of valid TLDs
%valid_tlds = (
'ac' => 1,
'ad' => 1,
'ae' => 1,
'aero' => 1,
'af' => 1,
'ag' => 1,
'ai' => 1,
'al' => 1,
'am' => 1,
'an' => 1,
'ao' => 1,
'aq' => 1,
'ar' => 1,
'arpa' => 1,
'as' => 1,
'asia' => 1,
'at' => 1,
'au' => 1,
'aw' => 1,
'ax' => 1,
'az' => 1,
'ba' => 1,
'bb' => 1,
'bd' => 1,
'be' => 1,
'bf' => 1,
'bg' => 1,
'bh' => 1,
'bi' => 1,
'biz' => 1,
'bj' => 1,
'bm' => 1,
'bn' => 1,
'bo' => 1,
'br' => 1,
'bs' => 1,
'bt' => 1,
'bv' => 1,
'bw' => 1,
'by' => 1,
'bz' => 1,
'ca' => 1,
'cat' => 1,
'cc' => 1,
'cd' => 1,
'cf' => 1,
'cg' => 1,
'ch' => 1,
'ci' => 1,
'ck' => 1,
'cl' => 1,
'cm' => 1,
'cn' => 1,
'co' => 1,
'com' => 1,
'coop' => 1,
'cr' => 1,
'cu' => 1,
'cv' => 1,
'cw' => 1,
'cx' => 1,
'cy' => 1,
'cz' => 1,
'de' => 1,
'dj' => 1,
'dk' => 1,
'dm' => 1,
'do' => 1,
'dz' => 1,
'ec' => 1,
'edu' => 1,
'ee' => 1,
'eg' => 1,
'er' => 1,
'es' => 1,
'et' => 1,
'eu' => 1,
'fi' => 1,
'fj' => 1,
'fk' => 1,
'fm' => 1,
'fo' => 1,
'fr' => 1,
'ga' => 1,
'gb' => 1,
'gd' => 1,
'ge' => 1,
'gf' => 1,
'gg' => 1,
'gh' => 1,
'gi' => 1,
'gl' => 1,
'gm' => 1,
'gn' => 1,
'gov' => 1,
'gp' => 1,
'gq' => 1,
'gr' => 1,
'gs' => 1,
'gt' => 1,
'gu' => 1,
'gw' => 1,
'gy' => 1,
'hk' => 1,
'hm' => 1,
'hn' => 1,
'hr' => 1,
'ht' => 1,
'hu' => 1,
'id' => 1,
'ie' => 1,
'il' => 1,
'im' => 1,
'in' => 1,
'info' => 1,
'int' => 1,
'io' => 1,
'iq' => 1,
'ir' => 1,
'is' => 1,
'it' => 1,
'je' => 1,
'jm' => 1,
'jo' => 1,
'jobs' => 1,
'jp' => 1,
'ke' => 1,
'kg' => 1,
'kh' => 1,
'ki' => 1,
'km' => 1,
'kn' => 1,
'kp' => 1,
'kr' => 1,
'kw' => 1,
'ky' => 1,
'kz' => 1,
'la' => 1,
'lb' => 1,
'lc' => 1,
'li' => 1,
'lk' => 1,
'lr' => 1,
'ls' => 1,
'lt' => 1,
'lu' => 1,
'lv' => 1,
'ly' => 1,
'ma' => 1,
'mc' => 1,
'md' => 1,
'me' => 1,
'mg' => 1,
'mh' => 1,
'mil' => 1,
'mk' => 1,
'ml' => 1,
'mm' => 1,
'mn' => 1,
'mo' => 1,
'mobi' => 1,
'mp' => 1,
'mq' => 1,
'mr' => 1,
'ms' => 1,
'mt' => 1,
'mu' => 1,
'museum' => 1,
'mv' => 1,
'mw' => 1,
'mx' => 1,
'my' => 1,
'mz' => 1,
'na' => 1,
'name' => 1,
'nc' => 1,
'ne' => 1,
'net' => 1,
'nf' => 1,
'ng' => 1,
'ni' => 1,
'nl' => 1,
'no' => 1,
'np' => 1,
'nr' => 1,
'nu' => 1,
'nz' => 1,
'om' => 1,
'org' => 1,
'pa' => 1,
'pe' => 1,
'pf' => 1,
'pg' => 1,
'ph' => 1,
'pk' => 1,
'pl' => 1,
'pm' => 1,
'pn' => 1,
'post' => 1,
'pr' => 1,
'pro' => 1,
'ps' => 1,
'pt' => 1,
'pw' => 1,
'py' => 1,
'qa' => 1,
're' => 1,
'ro' => 1,
'rs' => 1,
'ru' => 1,
'rw' => 1,
'sa' => 1,
'sb' => 1,
'sc' => 1,
'sd' => 1,
'se' => 1,
'sg' => 1,
'sh' => 1,
'si' => 1,
'sj' => 1,
'sk' => 1,
'sl' => 1,
'sm' => 1,
'sn' => 1,
'so' => 1,
'sr' => 1,
'st' => 1,
'su' => 1,
'sv' => 1,
'sx' => 1,
'sy' => 1,
'sz' => 1,
'tc' => 1,
'td' => 1,
'tel' => 1,
'tf' => 1,
'tg' => 1,
'th' => 1,
'tj' => 1,
'tk' => 1,
'tl' => 1,
'tm' => 1,
'tn' => 1,
'to' => 1,
'tp' => 1,
'tr' => 1,
'travel' => 1,
'tt' => 1,
'tv' => 1,
'tw' => 1,
'tz' => 1,
'ua' => 1,
'ug' => 1,
'uk' => 1,
'us' => 1,
'uy' => 1,
'uz' => 1,
'va' => 1,
'vc' => 1,
've' => 1,
'vg' => 1,
'vi' => 1,
'vn' => 1,
'vu' => 1,
'wf' => 1,
'ws' => 1,
'xn--0zwm56d' => 1,
'xn--11b5bs3a9aj6g' => 1,
'xn--3e0b707e' => 1,
'xn--45brj9c' => 1,
'xn--80akhbyknj4f' => 1,
'xn--80ao21a' => 1,
'xn--90a3ac' => 1,
'xn--9t4b11yi5a' => 1,
'xn--clchc0ea0b2g2a9gcd' => 1,
'xn--deba0ad' => 1,
'xn--fiqs8s' => 1,
'xn--fiqz9s' => 1,
'xn--fpcrj9c3d' => 1,
'xn--fzc2c9e2c' => 1,
'xn--g6w251d' => 1,
'xn--gecrj9c' => 1,
'xn--h2brj9c' => 1,
'xn--hgbk6aj7f53bba' => 1,
'xn--hlcj6aya9esc7a' => 1,
'xn--j6w193g' => 1,
'xn--jxalpdlp' => 1,
'xn--kgbechtv' => 1,
'xn--kprw13d' => 1,
'xn--kpry57d' => 1,
'xn--lgbbat1ad8j' => 1,
'xn--mgb9awbf' => 1,
'xn--mgbaam7a8h' => 1,
'xn--mgbayh7gpa' => 1,
'xn--mgbbh1a71e' => 1,
'xn--mgbc0a9azcg' => 1,
'xn--mgberp4a5d4ar' => 1,
'xn--mgbx4cd0ab' => 1,
'xn--o3cw4h' => 1,
'xn--ogbpf8fl' => 1,
'xn--p1ai' => 1,
'xn--pgbs0dh' => 1,
'xn--s9brj9c' => 1,
'xn--wgbh1c' => 1,
'xn--wgbl6a' => 1,
'xn--xkc2al3hye2a' => 1,
'xn--xkc2dl3a5ee0h' => 1,
'xn--yfro4i67o' => 1,
'xn--ygbi2ammx' => 1,
'xn--zckzah' => 1,
'xxx' => 1,
'ye' => 1,
'yt' => 1,
'za' => 1,
'zm' => 1,
'zw' => 1
);

# Domains where only third level registrations are ollowed 
# For all of the domains listed here there must be something at the third level
%need_third_level = (
'al' => 1,
'ar' => 1,
'au' => 1,
'bd' => 1,
'bn' => 1,
'br' => 1,
'bt' => 1,
'bv' => 1,
'cs' => 1,
'cy' => 1,
'dd' => 1,
'er' => 1,
'et' => 1,
'fk' => 1,
'gb' => 1,
'gh' => 1,
'gn' => 1,
'gu' => 1,
'id' => 1,
'il' => 1,
'jm' => 1,
'ke' => 1,
'kh' => 1,
'kp' => 1,
'kw' => 1,
'lb' => 1,
'lr' => 1,
'ls' => 1,
'mm' => 1,
'mq' => 1,
'mt' => 1,
'mz' => 1,
'nl' => 1,
'np' => 1,
'nz' => 1,
'om' => 1,
'pa' => 1,
'pg' => 1,
'py' => 1,
'qa' => 1,
'sb' => 1,
'sj' => 1,
'sv' => 1,
'sx' => 1,
'sz' => 1,
'th' => 1,
'tz' => 1,
'uk' => 1,
'va' => 1,
've' => 1,
'ye' => 1,
'yu' => 1,
'za' => 1,
'zm' => 1,
'zw' => 1
);

# Possible bitsquats
# lowercase and uppercase characters have slightly different bitsquats. 
# The letters are the same, but things like punctuation (.) are not bitsquats in uppercase
%variants = (
'a' => ['c', 'e', 'i', 'q'],
'b' => ['c', 'f', 'j', 'r'],
'c' => ['b', 'a', 'g', 'k', 's', '#'],
'd' => ['e', 'f', 'l', 't'],
'e' => ['d', 'g', 'a', 'm', 'u'],
'f' => ['g', 'd', 'b', 'n', 'v'],
'g' => ['f', 'e', 'c', 'o', 'w'],
'h' => ['i', 'j', 'l', 'x'],
'i' => ['h', 'k', 'm', 'a', 'y'],
'j' => ['k', 'h', 'n', 'b', 'z'],
'k' => ['j', 'i', 'o', 'c'],
'l' => ['m','n','h','d'],
'm' => ['l', 'o', 'i', 'e', '-'],
'n' => ['o', 'l', 'j', 'f', '.'],
'o' => ['n', 'm', 'k', 'g', '/'],
'p' => ['q', 'r', 't', 'x', '0'],
'q' => ['p', 's', 'u', 'y', 'a', '1'],
'r' => ['s', 'p', 'v', 'z', 'b', '2'],
's' => ['r', 'q', 'w', 'c', '3'],
't' => ['u', 'v', 'p', 'd', '4'],
'u' => ['t', 'w', 'q', 'e', '5'],
'v' => ['w', 't', 'r', 'f', '6'],
'w' => ['v', 'u', 's', 'g', '7'],
'x' => ['y', 'z' , 'p', 'h', '8'],
'y' => ['x', 'q', 'i', '9'],
'z' => ['x', 'r', 'j'],
'-' => ['m'],
'0' => ['1', '2', '4', '8', 'p'],
'1' => ['0', '3', '5', '9', 'q'],
'2' => ['3', '0', '6', 'r'],
'3' => ['2', '1', '7', 's'],
'4' => ['5', '6', '0', 't'],
'5' => ['4', '7', '1', 'u'],
'6' => ['7', '4', '2', 'v'],
'7' => ['6', '5', '3', 'w'],
'8' => ['9', '0', 'x'],
'9' => ['8', '1', 'y'],
);

# This is a list of the 2nd level domains under each ccTLD under which 
# official 3rd level domains can be legitimately registered through the NIC
%valid_second_level = (
'ac' => ['com','net','org','edu','mil','gov'],
'ad' => ['nom'],
'ae' => ['co','net','org','sch','ac','gov','mil','pro','name','aeda'],
'af' => ['com','net','org','edu','gov'],
'ag' => ['com','co','net','org','nom'],
'ai' => ['com','net','org','off'],
'al' => ['uniti','tirana','soros','upt','inima','com','net','org','mil','gov','edu','webhost','host','akep'],
'am' => ['co','net','org'],
'an' => ['com','net','org','edu'],
'ao' => ['co','go','gv','ed','it','pb'],
'aq' => [],
'ar' => ['com','net','org','edu','gob','gov','int','mil','tur','argentina','congresodelalengua3','educ','gobiernoelectronico','mecon','nacion','promocion','retina','uba'],
'as' => ['gov'],
'at' => ['co','or','priv','ac','gv','donteat','many'],
'au' => ['com','net','org','edu','gov','asn','id','csiro','info','conf','oz','act','nsw','nt','qld','sa','tas','vic','wa','archie','conf','gw','info','otc','telememo'],
'aw' => ['com','setar'],
'ax' => [],
'az' => ['com','net','int','gov','org','edu','info','pp','mil','name','pro','biz'],
'ba' => ['co','com','org','net','edu','gov','mil','untz','unmo','unze','unbi','web','rs'],
'bb' => ['co','com','biz','edu','gov','info','net','org','store','tv'],
'bd' => ['com','net','org','edu','gov','mil','ac'],
'be' => ['ac','dns'],
'bf' => ['gov'],
'bg' => ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','0','1','2','3','4','5','6','7','8','9','register'],
'bh' => ['com','net','org','gov','info','cc','edu'],
'bi' => ['co','com','or','org','edu'],
'bj' => ['com','edu','gouv','gov','asso','barreau'],
'bm' => ['com','edu','net','org','gov'],
'bn' => ['com','edu','gov','net','org'],
'bo' => ['com','net','org','tv','mil','int','gob','edu'],
'br' => ['adm','adv','agr','am','arq','art','ato','b','bio','blog','bmd','cim','cng','cnt','com','coop','ecn','edu','eng','esp','etc','eti','far','flog','fm','fnd','fot','fst','g12','ggf','gpv','imb','ind','inf','jor','jus','leg','lel','mat','med','mil','mus','net','nom','not','ntr','odo','ppg','pro','psc','psi','qsl','radio','rec','slg','srv','taxi','teo','tmp','trd','tur','tv','vet','vlog','wiki','zlg'],
'bs' => ['com','net','org','edu','gov','we'],
'bt' => ['com'],
'bv' => [],
'bw' => ['co','org'],
'by' => ['com','net','minsk'],
'bz' => ['com','net','org','gov','edu'],
'ca' => ['ab','bc','mb','nb','nf','nl','ns','nt','nu','on','pe','qc','sk','yk','gc','mil','cira'],
'cc' => ['com','net','cc','cu','edu','org'],
'cd' => ['com','net','org'],
'cf' => [],
'cg' => [],
'ch' => ['ch','02'],
'ci' => ['com','co','net','org','or','edu','ed','ac','go','asso','int','aeroport','presse'],
'ck' => ['co','org','edu','gov','net','gen','biz','info'],
'cl' => [],
'cm' => ['co','com','net','netcom'],
'cn' => ['ac','com','edu','gov','mil','net','org','ah','bj','cq','fj','gd','gs','gz','gx','ha','hb','he','hi','hk','hl','hn','jl','js','jx','ln','mo','nm','nx','qh','sc','sd','sh','sn','sx','tj','tw','xj','xz','yn','zj'],
'co' => ['e','com','org','edu','gov','net','mil','nom','go'],
'cr' => ['ac','co','ed','fi','go','or','sa'],
'cu' => ['com','edu','org','net','gov','inf'],
'cv' => ['net','gov','org','edu','int','publ','dns','com','come'],
'cx' => [],
'cy' => ['ac','net','gov','org','pro','name','ekologes','tm','ltd','biz','press','parliament','com'],
'cz' => [],
'de' => ['com'],
'dj' => [],
'dk' => ['co','biz'],
'dm' => ['com','net','org'],
'do' => ['art','com','edu','gob','gov','mil','net','org','sld','web'],
'dz' => ['com','org','net','gov','edu','asso','pol','art'],
'ec' => ['com','info','net','fin','med','pro','org','edu','gob','gov','mil'],
'ee' => ['com','pri','fie','med'],
'eg' => ['com','edu','eun','gov','info','mil','name','net','org','sci','tv'],
'er' => ['com','edu','gov','mil','net','org','ind'],
'es' => ['com','nom','org','gob','edu'],
'et' => ['com','gov','org','edu','net','biz','name','info'],
'eu' => ['europa'],
'fi' => [],
'fj' => ['ac','biz','com','info','mil','name','net','org','pro'],
'fk' => ['co','org','gov','ac','nom','net'],
'fm' => [],
'fo' => ['com'],
'fr' => ['asso','com','gouv','nom','tm'],
'ga' => ['md','press','gout','go','org','or','com','co','edu','ed','ac','net','aeroport','int'],
'gb' => ['hmg'],
'gd' => [],
'ge' => ['com','net','org','gov','edu','pvt','mil'],
'gf' => [],
'gg' => ['co','net','org'],
'gh' => ['com','edu','gov','org','mil'],
'go' => ['com','ltd','gov','mod','edu','org','ajuntament'],
'gl' => ['goo','co','com','net','org','edu'],
'gm' => [],
'gn' => ['com','ac','gov','org','net'],
'gp' => ['com','net','mobi','edu','asso','org'],
'gq' => [],
'gr' => ['com','net','org','edu','gov','mil','mod','sch','forth','co'],
'gs' => [],
'gt' => ['com','net','edu','gob','org','mil','ind'],
'gu' => ['com','net','gov','org','edu'],
'gw' => [],
'gy' => ['co','com','org','net','edu','gov'],
'hk' => ['com','org','net','edu','gov','idv'],
'hm' => ['my','orz'],
'hn' => ['com','net','org','edu','gob'],
'hr' => ['dns','iz','from','com'],
'ht' => ['adult','art','asso','com','firm','info','net','org','person','pol','pro','del','shop'],
'hu' => ['2000','agar','bolt','casino','city','co','erotica','erotika','film','forum','games','hotel','info','ingatlan','jogasz','konyvelo','lakes','media','news','org','priv','reklam','sex','shop','sport','suli','szex','tm','tozsde','utazas','video'],
'id' => ['ac','co','net','or','web','ssh','mil','go','my','biz'],
'ie' => ['gov','irlgov','ul'],
'il' => ['ac','co','org','net','k12','gov','muni','ldf'],
'im' => ['co','net','org','ac','com','gov'],
'in' => ['co','firm','net','org','gen','ind','ac','edu','res','ernet','gov','mil'],
'io' => [],
'iq' => ['com','net','org','gov','mil','edu'],
'ir' => ['co','net','org','sch','ac','ld','gov'],
'is' => [],
'it' => ['gov','edu','pg','co','bz'],
'je' => ['co','net','org'],
'jm' => ['com','net','org','edu','gov','mil'],
'jo' => ['com','edu','gov','mil','name','net','org'],
'jp' => ['ac','ad','co','ed','go','gr','lg','ne','or','tokyo'],
'ke' => ['co','or','ne','go','ac','sc','me','mobi','info'],
'kg' => ['org','net','com','edu','gov','mil'],
'kh' => ['per','com','edu','gov','mil','net','org'],
'ki' => ['com','biz','net','info','org','gov','edu','mob','tel'],
'km' => ['com','coop','lasso','nom','presse','tm','médecin','notaires','pharmaciens','vétérinaire','edu','gouv','mil'],
'kn' => ['net','org','edu','gov','dot','com'],
'kp' => ['com','org','rep'],
'kr' => ['co','ne','or','re','pe','go','mil','ac','hs','ms','es','sc','kg'],
'kw' => ['edu','com','net','org','gov'],
'ky' => ['com','org','net','edu','gov'],
'kz' => ['com','edu','gov','mil','net','org'],
'la' => [],
'lb' => ['com','edu','gov','net','org'],
'lc' => ['co','com','l','net','p','org'],
'li' => [],
'lk' => ['gov','sch','net','int','com','org','edu','ngo','soc','web','ltd','assn','grp','hotel'],
'lr' => ['com','edu','gov','org','net'],
'ls' => ['ac','co','gov','net','org','parliament'],
'lt' => [],
'lu' => [],
'lv' => ['com','edu','gov','org','mil','id','net','ask','conf'],
'ly' => ['com','net','gov','plc','edu','sch','med','org','id'],
'ma' => ['co','net','org','gov','press','ac'],
'mc' => ['tm','asso'], 
'md' => [],
'me' => ['gov','ac','nic','co','net','org','edu','priv','its'],
'mg' => ['com','org','nom','gov','prd','tm','edu','mil','in'],
'mh' => ['net'],
'mk' => ['com','org','net','edu','gov','inf','name','pro'],
'ml' => ['com','net','org','edu','gov','presse'],
'mm' => ['com','net','edu','org','gov'],
'mn' => ['gov','edu','org'],
'mo' => ['com','edu','gov','net','org'],
'mp' => ['chi','gov','org','co','get'],
'mq' => [],
'mr' => ['gov'],
'ms' => ['co','com','org'],
'mt' => ['com','org','net','edu','gov'],
'mu' => ['com','net','org','gov','ac','co','or'],
'mv' => ['aero','biz','com','coop','edu','gov','info','int','mil','museum','name','net','org','pro'],
'mw' => ['ac','co','com','coop','edu','gov','int','museum','net','org'],
'mx' => ['com','net','org','edu','gob'],
'my' => ['com','net','org','gov','edu','mil','name'],
'mz' => ['adv','ac','co','org','gov','edu'],
'na' => ['co','com','org'],
'nc' => ['asso','com','nom'],
'ne' => [],
'nf' => ['com','net','arts','store','web','firm','info','other','per','rec'],
'ng' => ['com','org','gov','edu','net','sch','name','mobi','mil'],
'ni' => ['co','com','gob','ac','edu','nom','net','mil'],
'nl' => ['sidn','123','co','com','net'],
'no' => ['priv','sep','fhb','folkebibl','fylkesbibl','herad','idrett','kommune','mil','museum','uenorge','stat','vgs','www','oslo','nt','co','norid'],
'np' => ['com','edu','gov','mil','net','org','name'],
'nr' => ['edu','gov','biz','info','net','org','com','co'],
'nu' => [],
'nz' => ['ac','co','geek','gen','kiwi','maoiri','net','org','school','cir','govt','iwi','parliament','mil','health'],
'om' => ['co','com','org','net','edu','gov','museum','pro','med'],
'pa' => ['net','com','ac','sld','gob','edu','org','abo','ing','med','nom'],
'pe' => ['edu','gob','nom','mil','sld','org','com','net','punto'],
'pf' => ['com','opt'],
'pg' => ['com','net','ac','gov','mil','org'],
'ph' => ['com','net','org','gov','mil','edu'],
'pk' => ['com','net','edu','org','fam','biz','web','gov','gok','gob','gkp','gop','gos','gog'],
'pl' => ['com','net','art','edu','org','ngo','biz','gov','info','mil','elk','gda','nysa','pila','torun','waw'],
'pm' => [],
'pn' => ['in','co','eu','org','net','government'],
'pr' => ['biz','com','edu','gov','info','isla','name','net','org','pro','est','prof','ac'],
'ps' => ['com','net','org','edu','gov','plo','sec'],
'pt' => ['com','edu','gov','int','net','nome','org','publ','dns'],
'pw' => [],
'py' => ['org','edu','mil','gov','net','com','coop'],
'qa' => ['com','net','name','gov','mil','org','edu','sch'],
're' => ['lasso','nom','com'],
'ro' => ['com','arts','firm','info','nom','nom','nt','org','rem','store','tm'],
'rs' => ['co','org','edu','ac','gov','in','blogge','rnids'],
'ru' => ['com','net','org','pp','cctld','ac','int','mil','gov','cap','nov','rnd','nnov','cbg','spb','bit','jar','msk','tula','tlt','snz','nkz','kms','cmw'],
'rw' => ['gov','net','edu','ac','com','co','int','mil','gouv'],
'sa' => ['com','edu','sch','med','gov','net','org','pub'],
'sb' => ['com','net','edu','org','gov'],
'sc' => ['com','net','edu','gov','org'],
'sd' => ['com','net','org','edu','med','tv','gov','info'],
'se' => ['a','b','ac','bd','c','d','e','f','g','h','i','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','org','parti','pp','press'],
'sg' => ['com','net','org','gov','edu','per','idn'],
'sh' => ['co','com','org','gov','edu','net','nom'],
'si' => ['ae','at','cn','co','de','uk','us'],
'sj' => [],
'sk' => ['gov'],
'sl' => ['com','net','org','edu','gov'],
'sm' => [],
'sn' => ['com','org','art','edu','gouv','perso','univ'],
'so' => ['com','net','org'],
'sr' => [],
'st' => ['gov','saotome','principe','consul ado','embaixada','org','edu','net','com','store','mil','co'],
'su' => [],
'sv' => ['com','red','org','gob','edu'],
'sx' => [],
'sy' => ['com','info','net','org','edu','gov','mil','news'],
'sz' => ['co','ac','org'],
'tc' => [],
'td' => [],
'tf' => ['eu','us','net','edu','www','com','w3','tm','fr'], 
'tg' => [],
'th' => ['ac','co','go','mi','or','net','in'],
'tj' => ['ac','aero','biz','co','com','coop','dyn','edu','go','gov','info','int','mil','museum','my','name','net','nic','org','per','pro','test','web'],
'tk' => [],
'tl' => ['gov','com','net','org','de','na','in'],
'tm' => [],
'tn' => ['com','ens','fin','gov','ind','intl','nat','net','org','info','perso','tourism','edunet','rnrt','rns','mincom','agrinet','defense'],
'to' => [],
'tp' => ['org'],
'tr' => ['nc','com','gen','org','biz','info','av','dr','bel','tsk','bbs','k12','edu','name','net','gov','pol','web','tel','tv'],
'tt' => ['co','com','org','net','biz','info','pro','int','coop','jobs','mobi','travel','museum','aero','cat','tel','name','mil','edu','gov'],
'tv' => ['co'],
'tw' => ['com','net','org','edu','gov','mil','idv','game','ebiz','club'],
'tz' => ['co','ac','go','or','mil','sc','ne','hotel','mobi','tv','info','me'],
'ua' => ['com','co','edu','gov','net','edu','in','at','pp','org','ck','cn','cv','dp','dn','if','in','kh','ks','km','kr','lg','lt','lviv','mk','od','pp','pl','rv','sm','te','uz','vn','yalta','zp','zt'],
'ug' => ['ac','co','com','go','ne','or','org','sc'],
'uk' => ['ac','co','gov','judiciary','ltd','me','mod','net','nhs','nic','org','parliament','plc','police','sch','bl','jet','nls','mil'],
'us' => ['fed','dni','isa','kids','nsn','al','ak','az','ar','ca','co','ct','de','fl','ga','hi','id','il','in','ia','ks','ky','la','me','md','ma','mi','mn','ms','mo','mt','ne','nv','nh','nj','nm','ny','nc','nd','oh','ok','or','pa','ri','sc','sd','tn','tx','ut','vt','va','wa','wv','wi','wy','dc'], 
'uy' => ['com','edu','gub','net','mil','org'],
'uz' => ['co','com','org','ac','aero','biz','coop','edu','info','int','museum','name','net','pro'],
'va' => ['www','vatican','photo','extranet','news','mailservice','pcf'],
'vc' => ['com','net','org'],
've' => ['com','net','org','info','co','web','gob','edu','mil','tec'],
'vg' => [],
'vi' => ['co','org','com','net','k12'],
'vn' => ['com','biz','edu','gov','net','org','int','ac','pro','info','health','name'],
'vu' => ['com','gov','net','org','edu'],
'wf' => [],
'ws' => ['org','gov','edu'],
'ye' => ['com','co','ltd','me','net','org','plc','gov'],
'yt' => [],
'za' => ['co','ac','edu','gov','law','mil','nom','org','school','alt','work','ngo','tm','web','agric','grondar','inca','nis'],
'zm' => ['co','ac','com','edu','gov','net','org','sch'],
'zw' => ['co','ac','org']
);

# Tie our hash to keep the order, so things look nice when we print them out
tie %found, 'Tie::IxHash';

# Loop through the FQDNs we are being passed
while (<FILE>) {
    # We are expecting to be passed a FQDN to check for bitsquats
    $FQDN = $_;
    chomp($FQDN);
    $FQDN = lc($FQDN);

    # Skip IP addresses
    next if ( $FQDN =~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/ );

    # Pass the FQDN to a subroutine to check o-based bitsquats 
    @o_squats = &extract_o_squats($FQDN);
    foreach $name ( @o_squats ) {
        $found{$name} = 1;
    }

    # Pass the FQDN to a subroutine to check n-based bitsquats
    @n_squats = &extract_n_squats($FQDN);
    foreach $name ( @n_squats ) {
        $found{$name} = 1;
    }

    # Pass the FQDN to a subroutine to check c-based bitsquats
    @c_squats = &extract_c_squats($FQDN);
    foreach $name ( @c_squats ) {
        $found{$name} = 1;
    }

    # Pass the FQDN to a subroutine to check the regular 2nd and TLD bitsquats
    @bit_squats = &extract_bitsquats($FQDN);
    foreach $name ( @bit_squats ) {
        $found{$name} = 1;
    }
}

# Print output
# Check to see if option has been set for RPZ output, otherwise just print a list
if ($options{'R'}) {
    print "; Bitsquat Response Policy Zone\n"; 
    print "\$TTL 10\n";
    print "@	IN    SOA  bitsquat.rpz. bitsquat.rpz.  (\n";
    print "                      1    ; serial\n";
    print "                      3H   ; refresh\n";
    print "                      1H   ; retry\n";
    print "                      1D   ; expiry\n";
    print "                      1H ) ; minimum\n";
    print "	IN    NS    localhost.\n\n";
    foreach $FQDN ( keys %found ) {
        print $FQDN . "\tCNAME\t.\n";
        print "*." . $FQDN . "\tCNAME\t.\n";
    }
} else {
    foreach $FQDN ( keys %found ) {
        print "$FQDN\n";
    }
}

exit;

