/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* Cory, if the program won't run due to a NumberFormatException, please

* change DELIMITER to ",". I have it as ", " because that is how all

* of our other input files are structured.

*/

import java.io.* ;

import java.util.* ;

import org.paukov.combinatorics.Factory ;

import org.paukov.combinatorics.Generator ;

import org.paukov.combinatorics.ICombinatoricsVector ;

import org.paukov.combinatorics.util.* ;

public class apriori

{

/**
 * Entry point: mines frequent itemsets and association rules (Apriori).
 *
 * Command line arguments:
 *   args[0] - input CSV path (each line: transaction ID, then product IDs)
 *   args[1] - output file path
 *   args[2] - minimum support percent, parsed as a double
 *   args[3] - minimum confidence, parsed as a double
 */
public static void main ( String [ ] args ) throws Exception
{
/* Take in the input filename, output filename, min support, and min confidence as
command line args. */
BufferedReader inputFile = new BufferedReader ( new FileReader ( args [ 0 ] ) ) ;
BufferedWriter outputFile = new BufferedWriter ( new FileWriter ( args [ 1 ] ) ) ;
double minSupportPercent = Double . parseDouble ( args [ 2 ] ) ;
double minConfidence = Double . parseDouble ( args [ 3 ] ) ;
// transactions: one sorted set of product IDs per input line (the ID column is dropped by csvToList).
// masterItemSet: every distinct product as a singleton set -- the candidate 1-itemsets.
//   NOTE(review): a TreeSet<TreeSet<Integer>> would throw ClassCastException on insert
//   (TreeSet is not Comparable), but this initial value is overwritten below before any add.
// verifiedFreqItemSets: cardinality -> (itemset -> support percent) for sets meeting minSupportPercent.
// associationRulesSupport / associationRulesConfidence: rule-keyed maps to percent values.
ArrayList < TreeSet < Integer >> transactions = new ArrayList < TreeSet < Integer >> ( ) ;
Set < TreeSet < Integer >> masterItemSet = new TreeSet < TreeSet < Integer >> ( ) ;
Map < Integer , HashMap < TreeSet < Integer > , Double >> verifiedFreqItemSets = new HashMap < Integer , HashMap < TreeSet < Integer > , Double >> ( ) ;
Map < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > associationRulesSupport = new HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > ( ) ; // Antecedent as key, consequent as value
Map < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > associationRulesConfidence = new HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > ( ) ; // Antecedent as key, consequent as value
/* Read the transactions, then collect the distinct items as singleton sets. */
transactions = ( ArrayList < TreeSet < Integer >> ) csvToList ( inputFile ) ;
masterItemSet = generateMasterItemSet ( transactions ) ;
// Seed every 1-itemset with a 100% support so the first recursive pass builds all 2-sets.
Map < TreeSet < Integer > , Double > masterItemSetMap = new HashMap < TreeSet < Integer > , Double > ( ) ;
for ( TreeSet < Integer > item : masterItemSet )
masterItemSetMap. put ( item, 100.0 ) ; // Putting 100 in guarantees that we will generate all 2-sets and saves us time from writing another tedious method.
/* getVerifiedItemSets returns the results as a Triple since a method returns one value:
 * Entry 1: The verified item sets in the form Cardinality -> (verifiedSets -> SupportPercent)
 * Entry 2: The association rules map as (Antecedent -> Consequent) -> SupportPercent
 * Entry 3: The association rules map as (Antecedent -> Consequent) -> Confidence
 */
Triple < HashMap < Integer , HashMap < TreeSet < Integer > , Double >> , HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > , HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double >> threeMaps = new Triple < HashMap < Integer , HashMap < TreeSet < Integer > , Double >> , HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > , HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double >> ( ) ;
threeMaps. set ( getVerifiedItemSets ( masterItemSetMap, transactions, minSupportPercent, minConfidence, ( HashMap < Integer , HashMap < TreeSet < Integer > , Double >> ) verifiedFreqItemSets, associationRulesSupport, associationRulesConfidence, 2 ) ) ;
// Unpack the triple into the three result maps, then serialize everything.
verifiedFreqItemSets = threeMaps. get1 ( ) ;
associationRulesSupport = threeMaps. get2 ( ) ;
associationRulesConfidence = threeMaps. get3 ( ) ;
writeMapsToFile ( verifiedFreqItemSets, associationRulesSupport, associationRulesConfidence, outputFile ) ;
}

/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/

/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Helper Methods Start Here~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/

/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/

// Reads the transaction CSV: each line is "<transactionID><sep><item><sep><item>...".
// Returns one sorted set of items per non-blank line; the ID column is discarded.
// Closes inputFile when the stream is exhausted.
private static List<TreeSet<Integer>> csvToList(BufferedReader inputFile) throws Exception
{
    List<TreeSet<Integer>> productSets = new ArrayList<TreeSet<Integer>>();
    final int START_POSITION = 1; // column 0 is the transaction ID, not a product
    // Regex delimiter accepts both "," and ", " separated files, so the
    // DELIMITER hand-editing described in the file header is no longer needed.
    final String DELIMITER = "\\s*,\\s*";
    String inputLine;
    // readLine() == null is the reliable end-of-stream test; the previous
    // ready() loop could terminate early on a stream that momentarily blocks.
    while ((inputLine = inputFile.readLine()) != null)
    {
        inputLine = inputLine.trim();
        if (inputLine.isEmpty())
            continue; // tolerate blank lines instead of crashing in parseInt
        String[] splitLine = inputLine.split(DELIMITER);
        TreeSet<Integer> products = new TreeSet<Integer>();
        for (int i = START_POSITION; i < splitLine.length; i++) // start at 1: skip the ID
            products.add(Integer.parseInt(splitLine[i]));
        productSets.add(products);
    }
    inputFile.close();
    return productSets;
}

/**
 * Writes the verified itemsets and the association rules to outputFile.
 *
 * Output format, derived by regex-splitting the maps' toString() text:
 *   one "set, <supportPercent>, <item list>" line per verified itemset, then
 *   one "rule, <support>, <confidence>, <antecedent>, => ,<consequent>" line per rule.
 * Closes outputFile on every path, including the no-itemsets early exit.
 *
 * NOTE(review): this method depends on the exact toString() rendering of the
 * nested HashMap/TreeSet values; any change to that format silently breaks it.
 * NOTE(review): the split regexes contain literal spaces around the escapes
 * (e.g. " \\ {"), which only match if those spaces really occur in the
 * toString() text -- this looks like formatting damage; verify on a real run.
 */
private static void writeMapsToFile ( Map < Integer , HashMap < TreeSet < Integer > , Double >> verifiedFreqItemSets, Map < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > associationRulesSupport, Map < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > associationRulesConfidence, BufferedWriter outputFile ) throws Exception
{
// Write all of the sets and their support percents to the file.
String vfiString = new String ( verifiedFreqItemSets. toString ( ) ) ;
try
{
// Magic offsets 4 and 2 strip the outer "{2={" ... "}}" wrapping of the nested map.
vfiString = vfiString. substring ( 4 , vfiString. length ( ) - 2 ) ; // Gets rid of the side braces
}
catch ( StringIndexOutOfBoundsException e )
{
// An empty map leaves too few characters to trim: nothing qualified, so bail out.
System . out . println ( "Error: No file written because no verified item sets were found." ) ;
outputFile. close ( ) ;
System . exit ( 0 ) ;
}
String [ ] temp = vfiString. split ( "[0-9]= \\ {| \\ }|[0-9]+= \\ {" ) ; // Splits away the "n={" cardinality prefixes and closing braces.
StringBuilder cleanedvfi = new StringBuilder ( ) ;
for ( int i = 0 ; i < temp. length ; i ++ )
cleanedvfi. append ( temp [ i ] ) ;
cleanedvfi. deleteCharAt ( 0 ) ; // Gets rid of starting [
String [ ] splitStuff = cleanedvfi. toString ( ) . split ( " \\ ]=|, \\ [" ) ;
// Entries alternate (itemset text, support percent), so consume two per output line.
for ( int i = 0 ; i < splitStuff. length - 1 ; i += 2 ) // Iterate every other number so we can write one line per iteration
{
outputFile. write ( "set, " + splitStuff [ i + 1 ] + ", " + splitStuff [ i ] ) ;
if ( i != splitStuff. length - 2 ) // no trailing newline after the last set line
outputFile. newLine ( ) ;
}
// At this point, the VIF map has been successfully written to the file.
// Now, write the rules and confidences.
String rulesStringSupport = new String ( associationRulesSupport. toString ( ) ) ;
String rulesStringConfidence = new String ( associationRulesConfidence. toString ( ) ) ;
String [ ] splitStuffSupport = rulesStringSupport. split ( " \\ { \\ { \\ [| \\ ]= \\ [| \\ ] \\ }=|, \\ { \\ [| \\ }$" ) ;
String [ ] splitStuffConfidence = rulesStringConfidence. split ( " \\ { \\ { \\ [| \\ ]= \\ [| \\ ] \\ }=|, \\ { \\ [| \\ }$" ) ;
// Groups of three: i = antecedent, i + 1 = consequent, i + 2 = percent value.
// NOTE(review): assumes both rule maps iterate their shared keys in the same
// order; true in practice for identically-built HashMaps, but not guaranteed.
for ( int i = 1 ; i < splitStuffSupport. length - 2 ; i += 3 ) // Start at 1 to avoid working with blank string
{
outputFile. newLine ( ) ;
outputFile. write ( "rule, " + splitStuffSupport [ i + 2 ] + ", " + splitStuffConfidence [ i + 2 ] ) ;
outputFile. write ( ", " + splitStuffSupport [ i ] + ", => ," + splitStuffSupport [ i + 1 ] ) ;
}
outputFile. close ( ) ;
}

// Collects every distinct item appearing in any transaction and wraps each one
// in its own singleton TreeSet -- these are the candidate 1-itemsets.
private static Set < TreeSet < Integer >> generateMasterItemSet ( List < TreeSet < Integer >> purchasedItemSets )
{
    Set<TreeSet<Integer>> singletons = new HashSet<TreeSet<Integer>>();
    for (TreeSet<Integer> transaction : purchasedItemSets)
    {
        for (Integer product : transaction)
        {
            TreeSet<Integer> singleton = new TreeSet<Integer>();
            singleton.add(product);
            singletons.add(singleton); // HashSet deduplicates repeated products
        }
    }
    return singletons;
}

// Support count of one itemset: how many transactions in combs contain every
// item of comb. Duplicate transactions each count once, so the total is
// relative to the ORIGINAL transaction list passed in.
private static Integer getSupportCount ( TreeSet < Integer > comb, ArrayList < TreeSet < Integer >> combs )
{
    int occurrences = 0;
    for (TreeSet<Integer> candidate : combs)
    {
        // A transaction "supports" comb when it is a superset of it.
        if (candidate.containsAll(comb))
        {
            occurrences++;
        }
    }
    return Integer.valueOf(occurrences);
}

// Support percent of comb relative to the ORIGINAL transaction list: the
// percentage (0-100) of entries in combs that contain every item of comb.
// Duplicate transactions each count once toward both numerator and denominator.
private static Double getSupportPercent ( TreeSet < Integer > comb, ArrayList < TreeSet < Integer >> combs )
{
    // Guard: 0 * 100 / 0 on doubles previously produced NaN, which then
    // silently failed every ">= minSupportPercent" comparison downstream.
    if (combs.isEmpty())
        return 0.0;
    double support = 0;
    for (TreeSet<Integer> candidate : combs)
    {
        if (candidate.containsAll(comb)) // candidate supports comb when it is a superset
            support++;
    }
    return support * 100 / combs.size();
}

/**
 * Recursive Apriori driver. Start by passing n = 2 with the seeded 1-itemsets;
 * each call builds candidate n-sets from the items surviving in vifPrev,
 * verifies them against minSupportPercent, mines association rules from the
 * survivors, and recurses with n + 1 until no candidates can be generated.
 *
 * Returns a Triple of:
 *   1. cardinality -> (itemset -> support percent) for all verified sets
 *   2. (antecedent -> consequent) -> support percent
 *   3. (antecedent -> consequent) -> confidence
 *
 * NOTE: verifiedFreqItemSets, associationRulesSupport and
 * associationRulesConfidence are mutated in place as well as returned.
 */
private static Triple < HashMap < Integer , HashMap < TreeSet < Integer > , Double >> , HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > , HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double >> getVerifiedItemSets ( Map < TreeSet < Integer > , Double > vifPrev, ArrayList < TreeSet < Integer >> transactions, double minSupportPercent, double minConfidence, Map < Integer , HashMap < TreeSet < Integer > , Double >> verifiedFreqItemSets, Map < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > associationRulesSupport, Map < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > associationRulesConfidence, int n )
{
// This is the structure we're returning.
Triple < HashMap < Integer , HashMap < TreeSet < Integer > , Double >> , HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > , HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double >> returnMaps = new Triple < HashMap < Integer , HashMap < TreeSet < Integer > , Double >> , HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > , HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double >> ( ) ;
// Flatten the surviving (n-1)-sets into the pool of items still worth combining,
// then build every n-element candidate from that pool (Apriori subset property).
// The cast is safe because flattenSet always returns a TreeSet instance.
TreeSet < Integer > currentItems = ( TreeSet < Integer > ) flattenSet ( vifPrev. keySet ( ) ) ;
Set < TreeSet < Integer >> nSets = getAllNSets ( currentItems, n ) ;
Map < TreeSet < Integer > , Double > vifCurrent = new HashMap < TreeSet < Integer > , Double > ( ) ;
if ( nSets. isEmpty ( ) ) // Base case: no more candidates can be built, so the recursion ends.
{ // Package the three accumulated maps into the joint return structure.
returnMaps. set1 ( ( HashMap < Integer , HashMap < TreeSet < Integer > , Double >> ) verifiedFreqItemSets ) ;
returnMaps. set2 ( ( HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > ) associationRulesSupport ) ;
returnMaps. set3 ( ( HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > ) associationRulesConfidence ) ;
return returnMaps ;
}
// Keep only the candidates whose support meets the threshold.
for ( TreeSet < Integer > nSet : nSets )
{
double supportPercent = getSupportPercent ( nSet, transactions ) ;
if ( supportPercent >= minSupportPercent ) // Add the item to vifCurrent if its support count is high enough
vifCurrent. put ( nSet, supportPercent ) ;
}
if ( ! vifCurrent. isEmpty ( ) ) // Without this check we'd record empty cardinality entries.
verifiedFreqItemSets. put ( n, ( HashMap < TreeSet < Integer > , Double > ) vifCurrent ) ; // Add the n -> all valid sets of size n pair to the VIF map
// Mine association rules from this generation's verified sets.
Pair < HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > ,HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double >> associationRulesMaps = new Pair < HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > ,HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double >> ( getAssociationRules ( vifCurrent, minSupportPercent, minConfidence, transactions ) ) ;
// Merge this generation's rules into the running accumulators.
associationRulesSupport. putAll ( associationRulesMaps. get1 ( ) ) ;
associationRulesConfidence. putAll ( associationRulesMaps. get2 ( ) ) ;
// Recurse for the next generation with the sets verified in this one.
returnMaps = getVerifiedItemSets ( vifCurrent, transactions, minSupportPercent, minConfidence, verifiedFreqItemSets, associationRulesSupport, associationRulesConfidence, ++ n ) ;
return returnMaps ;
}

// Builds every antecedent => consequent rule derivable from the verified
// itemsets in vifCurrent and keeps those meeting BOTH the support-percent and
// confidence thresholds. Returns the pair
// (rule -> support percent, rule -> confidence), where each rule is a
// one-entry HashMap mapping antecedent to consequent.
private static Pair < HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double > , HashMap < HashMap < TreeSet < Integer > , TreeSet < Integer >> , Double >> getAssociationRules ( Map < TreeSet < Integer > , Double > vifCurrent, double minSupportPercent, double minConfidence, ArrayList < TreeSet < Integer >> transactions )
{
    Map<HashMap<TreeSet<Integer>, TreeSet<Integer>>, Double> ruleSupport = new HashMap<HashMap<TreeSet<Integer>, TreeSet<Integer>>, Double>();
    Map<HashMap<TreeSet<Integer>, TreeSet<Integer>>, Double> ruleConfidence = new HashMap<HashMap<TreeSet<Integer>, TreeSet<Integer>>, Double>();
    for (TreeSet<Integer> itemSet : vifCurrent.keySet())
    {
        // getAllPartitions returns a flat list of splits: indices (i, i + 1)
        // form one (antecedent, consequent) candidate.
        List<ArrayList<Integer>> partitionPairs = new ArrayList<ArrayList<Integer>>(getAllPartitions(itemSet));
        for (int i = 0; i + 1 < partitionPairs.size(); i += 2)
        {
            TreeSet<Integer> antecedent = new TreeSet<Integer>(partitionPairs.get(i));
            TreeSet<Integer> consequent = new TreeSet<Integer>(partitionPairs.get(i + 1));
            TreeSet<Integer> union = new TreeSet<Integer>(antecedent);
            union.addAll(consequent);
            // Support of the whole rule is the support of antecedent ∪ consequent;
            // confidence divides that by the antecedent's own support (see getConfidence).
            int unionSupportCount = getSupportCount(union, transactions);
            double supportPercent = (double) unionSupportCount * 100 / (double) transactions.size();
            double confidence = getConfidence(antecedent, consequent, unionSupportCount, transactions);
            if (confidence >= minConfidence && supportPercent >= minSupportPercent)
            {
                // The one-entry map is the key for both result maps. It is a
                // mutable key, which is safe only because rules are never
                // modified after insertion.
                HashMap<TreeSet<Integer>, TreeSet<Integer>> rule = new HashMap<TreeSet<Integer>, TreeSet<Integer>>();
                rule.put(antecedent, consequent);
                ruleSupport.put(rule, supportPercent);
                ruleConfidence.put(rule, confidence);
            }
        }
    }
    Pair<HashMap<HashMap<TreeSet<Integer>, TreeSet<Integer>>, Double>, HashMap<HashMap<TreeSet<Integer>, TreeSet<Integer>>, Double>> result = new Pair<HashMap<HashMap<TreeSet<Integer>, TreeSet<Integer>>, Double>, HashMap<HashMap<TreeSet<Integer>, TreeSet<Integer>>, Double>>();
    result.set1((HashMap<HashMap<TreeSet<Integer>, TreeSet<Integer>>, Double>) ruleSupport);
    result.set2((HashMap<HashMap<TreeSet<Integer>, TreeSet<Integer>>, Double>) ruleConfidence);
    return result;
}

// Confidence of the rule left => right, as a percentage:
//   confidence = support(left ∪ right) / support(left) * 100.
// unionSupportCount is support(left ∪ right), already computed by the caller,
// so `right` is not consulted here. (The old body rebuilt the union locally
// but never used it -- that dead computation is removed.)
// The cast is safe because callers always pass TreeSet instances.
private static Double getConfidence ( Set < Integer > left, Set < Integer > right, int unionSupportCount, ArrayList < TreeSet < Integer >> transactions )
{
    double confidence = (double) unionSupportCount * 100 / (double) getSupportCount((TreeSet<Integer>) left, transactions);
    return confidence;
}

// Flattens a set of itemsets into one sorted set holding every distinct item.
// The result must stay a TreeSet instance: getVerifiedItemSets downcasts it.
private static Set < Integer > flattenSet ( Set < TreeSet < Integer >> itemSet )
{
    TreeSet<Integer> flattened = new TreeSet<Integer>();
    for (TreeSet<Integer> member : itemSet)
    {
        flattened.addAll(member);
    }
    return flattened;
}

// Generates every distinct n-element combination of the given items, each
// returned as a sorted set. Uses combinatoricslib's simple (non-repeating)
// combination generator.
private static Set < TreeSet < Integer >> getAllNSets ( TreeSet < Integer > items, int n )
{
    Set<TreeSet<Integer>> combinations = new HashSet<TreeSet<Integer>>();
    ICombinatoricsVector<Integer> itemVector = Factory.createVector(items);
    Generator<Integer> generator = Factory.createSimpleCombinationGenerator(itemVector, n);
    // Each generated vector is converted through its backing list into a TreeSet.
    for (ICombinatoricsVector<Integer> combination : generator)
    {
        combinations.add(new TreeSet<Integer>(combination.getVector()));
    }
    return combinations;
}

// We return a List of Lists and will iterate through even indices soon:
// indices (2k, 2k + 1) form one (antecedent, consequent) split candidate.
// We do this instead of using a HashMap so we can have duplicate keys.
//
// NOTE(review): ComplexCombinationGenerator(vector, i) splits items into i
// parts, but only parts 0 and 1 are kept; for i > 2 the two kept parts cannot
// cover the whole set -- confirm whether i should be fixed at 2.
// NOTE(review): the loop bound allItemSubsets.size() (number of subsets, not a
// partition count) and the catch-and-break on RuntimeException look like
// workarounds for generator out-of-bounds behavior; the catch silently
// truncates the output.
private static List < ArrayList < Integer >> getAllPartitions ( TreeSet < Integer > items )
{ // Use an ArrayList because it doesn't consider order.
List < ArrayList < Integer >> potentialSets = new ArrayList < ArrayList < Integer >> ( ) ;
Set < TreeSet < Integer >> allItemSubsets = new HashSet < TreeSet < Integer >> ( ) ;
for ( int i = 1 ; i < items. size ( ) ; i ++ ) // Generate all proper subsets of the given set, sizes 1..size-1.
allItemSubsets. addAll ( getAllNSets ( items, i ) ) ;
for ( int i = 2 ; i < allItemSubsets. size ( ) ; i ++ )
{
try // It kills me to do this but I couldn't get rid of out of bounds.
{
ICombinatoricsVector < Integer > vector = Factory. createVector ( items ) ;
// ComplexCombinationGenerator enumerates ways to split `items` into i parts.
Generator < ICombinatoricsVector < Integer >> gen = new ComplexCombinationGenerator < Integer > ( vector, i ) ;
for ( ICombinatoricsVector < ICombinatoricsVector < Integer >> comb : gen )
{
// Record only the first two parts of each split as an (antecedent, consequent) pair.
potentialSets. add ( ( ArrayList < Integer > ) comb. getValue ( 0 ) . getVector ( ) ) ;
potentialSets. add ( ( ArrayList < Integer > ) comb. getValue ( 1 ) . getVector ( ) ) ;
}
}
catch ( RuntimeException e )
{
// Stop generating as soon as the generator fails for the current i.
break ;
}
}
return potentialSets ;
}

// Parses every entry of a String array as a base-10 integer, preserving order.
// Propagates NumberFormatException for any non-numeric entry.
private static Integer [ ] arrayToInteger ( String [ ] sArr ) throws Exception
{
    Integer[] parsed = new Integer[sArr.length];
    int index = 0;
    for (String token : sArr)
    {
        parsed[index++] = Integer.valueOf(token);
    }
    return parsed;
}