In this small study, we use hierarchical clustering techniques to explore the structure of correlations between US stocks. To do so, we first download a dataset of adjusted close prices for the US stocks from Quandl.

import numpy as np import pandas as pd from datetime import datetime import matplotlib.pyplot as plt % matplotlib inline

# Load the Quandl price dump: one row per (ticker, date) with raw and
# adjusted OHLCV columns.
df = pd.read_csv('data/WIKI_PRICES.csv')

# Preview the first rows to check the columns that were read.
df.head()

ticker date open high low close volume ex-dividend split_ratio adj_open adj_high adj_low adj_close adj_volume 0 A 1999-11-18 45.50 50.00 40.00 44.00 44739900 0 1 31.105117 34.181447 27.345157 30.079673 44739900 1 A 1999-11-19 42.94 43.00 39.81 40.38 10897100 0 1 29.355027 29.396044 27.215268 27.604936 10897100 2 A 1999-11-22 41.31 44.00 40.06 44.00 4705200 0 1 28.240711 30.079673 27.386175 30.079673 4705200 3 A 1999-11-23 42.50 43.63 40.25 40.25 4274400 0 1 29.054230 29.826730 27.516065 27.516065 4274400 4 A 1999-11-24 40.13 41.94 40.00 41.06 3464400 0 1 27.434029 28.671398 27.345157 28.069804 3464400

# Convert the long-format frame built from the CSV file (one row per
# ticker/date) into a wide price table: one row per date, one column per
# ticker, holding the adjusted close.
names = sorted(set(df['ticker']))

# Work on an explicit copy so that assigning the index below does not write
# into a slice of `df`.
subdf = df[['ticker', 'date', 'adj_close']].copy()

# Vectorised date parsing. The format is '%Y-%m-%d' (e.g. 1999-11-18); a
# directive written with a space after '%' is rejected by strptime.
# `.values` replaces Series.get_values(), which newer pandas no longer has.
subdf.index = pd.to_datetime(subdf['date'].values, format='%Y-%m-%d')
sorted_dates = sorted(set(subdf.index))

prices_df = subdf.pivot_table('adj_close', ['date'], 'ticker')
prices_df.index = pd.to_datetime(prices_df.index, format='%Y-%m-%d')

# Clear the axis labels left by pivot_table. Assigning None is accepted by
# every pandas version, whereas `del index.name` fails on recent releases.
prices_df.index.name = None
prices_df.columns.name = None

# Report the shape of the actual price table rather than allocating an empty
# dates-by-tickers placeholder frame just to print its dimensions.
print("shape:", prices_df.shape)
prices_df.head()

shape: (14105, 3190)

There are 3190 stocks in total, with more or less complete historical price series.

A AA AAL AAMC AAN AAOI AAON AAP AAPL AAT ... ZIOP ZIXI ZLC ZLTQ ZMH ZNGA ZOES ZQK ZTS ZUMZ 1962-01-02 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1962-01-03 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1962-01-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1962-01-05 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1962-01-08 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5 rows × 3190 columns

We consider only the last 2000 trading days, i.e. starting in roughly 2010.

# Consider a smaller dataframe: keep every ticker column but only the last
# 2000 rows (trading days) of the price table.
subperimeter_prices = prices_df.iloc[-2000:]

# Plot all price series on one wide figure, with the y-axis limited to
# the 0-1000 range.
plt.figure(figsize=(30, 8))
plt.plot(subperimeter_prices)
plt.ylim([0, 1000])
plt.show()

# Convert prices to returns: first difference of the log prices along the
# time axis, so the result has one row fewer than the price table.
log_prices = np.log(subperimeter_prices)
X = pd.DataFrame(
    np.diff(log_prices, axis=0),
    index=subperimeter_prices.index[1:],  # the first date has no return
    columns=subperimeter_prices.columns,
)

plt.figure(figsize=(30, 8))
plt.plot(X)
plt.show()

X.shape

(1999, 3190)

We keep only the stocks that have enough history, i.e. at least 95% non-missing values.

# NOTE(review): the star imports hide which module provides each helper used
# in this notebook; they are kept because the modules are not visible here.
from ClusterLib.clusterlib import *
from ClusterLib.preprocessing import *

min_percent = 95  # history threshold handed to delete_small_historical
nb_clusters = 75  # passed to the clustering helpers further down

# Filter the return series by available history, then replace the remaining
# missing returns with 0.
X = delete_small_historical(X, min_percent)
X = X.fillna(value=0)

plt.figure(figsize=(30, 8))
plt.plot(X)
plt.show()

# Resulting shape, and whether any missing value is left.
X.shape, X.isnull().values.any()

((1999, 2068), False)

We compute the clusters using the Ward method working on distances based on Spearman correlation between stocks.

# Cluster the return series with the "ward" method and the "spearman"
# correlation option. nb_clusters is presumably the number of clusters
# requested -- confirm against ClusterLib.
clusters, cluster_map = get_clusters(
    X,
    "ward",
    "spearman",
    nb_clusters,
)

# Display the cluster arrays returned above.
clusters

[array(['EXC', 'FE', 'AEE', 'CMS', 'DTE', 'LNT', 'PNW', 'SCG', 'ES', 'WEC', 'XEL', 'GXP', 'WR', 'ED', 'SO', 'AEP', 'DUK', 'D', 'NEE', 'EIX', 'PCG', 'ETR', 'PEG', 'PPL'], dtype=object), array(['AWK', 'WTR', 'HE', 'MGEE', 'OTTR', 'EE', 'PNM', 'POR', 'AVA', 'IDA', 'ALE', 'NWE', 'SJI', 'NJR', 'WGL', 'NWN', 'SWX', 'BKH', 'UGI', 'ATO', 'VVC', 'CPK', 'UTL', 'CPN', 'NRG', 'CNP', 'OGE', 'NI', 'SRE', 'AES', 'MDU', 'NFG'], dtype=object), array(['TSN', 'PPC', 'SAFM'], dtype=object), array(['PM', 'MO', 'RAI', 'UVV', 'VGR', 'SYY', 'HSY', 'CAG', 'MDLZ', 'SJM', 'HRL', 'GIS', 'K', 'CPB', 'MKC', 'DF', 'FLO', 'PG', 'CL', 'KMB', 'CHD', 'CLX', 'TAP', 'BF_B', 'STZ', 'MNST', 'CCE', 'DPS', 'KO', 'PEP'], dtype=object), array(['GPT', 'CDR', 'KRG', 'RPT', 'ADC', 'GTY', 'BFS', 'UBA', 'UHT', 'IRET', 'MNR', 'ALX', 'UMH', 'GOOD', 'OLP'], dtype=object), array(['SBRA', 'HR', 'OHI', 'SNH', 'VTR', 'HCN', 'HCP', 'MPW', 'LTC', 'NHI', 'CUBE', 'EXR', 'PSA', 'MAA', 'AIV', 'CPT', 'ESS', 'UDR', 'AVB', 'EQR', 'ACC', 'EDR', 'ELS', 'SUI', 'EQIX', 'DFT', 'DLR', 'CLI', 'OFC', 'SLG', 'BXP', 'VNO', 'ARE', 'DEI', 'KRC', 'WRE', 'BDN', 'DRE', 'HIW', 'CUZ', 'PKY', 'GOV', 'FPO', 'FR', 'PLD', 'DCT', 'EGP', 'LXP', 'FSP', 'PSB', 'EPR', 'O', 'GGP', 'DDR', 'FRT', 'SPG', 'AKR', 'REG', 'KIM', 'WRI', 'SKT', 'MAC', 'TCO', 'FCE_A', 'CBL', 'PEI'], dtype=object), array(['PZG', 'GORO', 'CDE', 'HL', 'GOLD', 'NEM', 'RGLD'], dtype=object), array(['ARI', 'STWD', 'CIM', 'PMT', 'RWT', 'RAS', 'RSO', 'AI', 'STAR', 'NYMT', 'ARR', 'CYS', 'AGNC', 'NLY', 'DX', 'IVR', 'MFA', 'ANH', 'CMO'], dtype=object), array(['BPTH', 'OLBK', 'MCBC', 'NATH', 'RVLT', 'FRBK', 'III'], dtype=object), array(['BRT', 'ARCW', 'LOV', 'SIF'], dtype=object), array(['CUR', 'CUTR', 'PTSI', 'BBGI', 'GAIA', 'LEE'], dtype=object), array(['VTNR', 'REI', 'TAT', 'CIDM', 'GALT', 'NNVC'], dtype=object), array(['INTX', 'WSTL', 'GTXI', 'OGXI', 'PRKR', 'PTX', 'NAVB', 'SNSS', 'OMEX', 'BDSI', 'RPRX'], dtype=object), array(['NETE', 'PCYO', 'TWMC', 'EGAN', 'UNXL', 'BIOL', 'NEON'], 
dtype=object), array(['MITK', 'GLUU', 'ZAGG', 'MEIP', 'NEO', 'PCYG', 'GFN', 'ISSC', 'STRT', 'PHMD', 'TEAR', 'CPSS', 'DAVE', 'TRXC', 'CALL', 'PERI'], dtype=object), array(['PCTI', 'BHB', 'HOFT', 'MOFG', 'AMNB', 'PGC', 'RDI', 'TIS', 'HBCP', 'NRIM', 'FFNW', 'WSBF', 'CUNB', 'PPBI', 'FBIZ', 'SGBK', 'GBNK', 'PFBC', 'LION', 'HBNC', 'MBWM', 'ANCX', 'FFKT'], dtype=object), array(['CFFI', 'DGAS', 'CASH', 'DJCO', 'SPNS', 'LCNB', 'BXC', 'PROV', 'HIFS', 'ITIC', 'CHMG', 'PLPM', 'CIX', 'LFVN'], dtype=object), array(['REIS', 'SALM', 'AE', 'UCFC', 'ACHC', 'ADUS', 'AXDX', 'BSET', 'FLXS', 'SNAK', 'TESS', 'MLAB', 'UTMD', 'CORR', 'HCCI', 'CLCT', 'PATK', 'MDCA', 'WETF', 'NLS', 'CSV', 'PGTI', 'CECE', 'TREC', 'SPA', 'UFPT'], dtype=object), array(['FNHC', 'HCI', 'UIHC', 'DXYN', 'ESCA', 'SQBG', 'DOOR', 'PFIS', 'CUI', 'PLUG', 'TREE', 'GTT', 'NYNY'], dtype=object), array(['GALE', 'IDRA', 'CYTR', 'INO', 'SGYP', 'THLD', 'ARWR', 'ACHN', 'AGEN', 'KERX', 'HRTX', 'INSY', 'MDXG', 'TXMD', 'FCSC', 'OHRP'], dtype=object), array(['LGND', 'LXRX', 'MNTA', 'ACOR', 'EBS', 'GHDX', 'SPPI', 'MDCO', 'RIGL', 'CRIS', 'EXEL', 'CYTK', 'SGMO', 'HALO', 'IMGN', 'CLDX', 'ALNY', 'NBIX', 'BCRX', 'GERN', 'NVAX', 'ZIOP', 'NKTR', 'PGNX', 'ARRY', 'IMMU', 'VNDA', 'DVAX', 'INFI', 'ANIP', 'OPK', 'SRPT', 'XOMA', 'VVUS', 'ARNA', 'OREX', 'MNKD', 'MYGN', 'EXAS', 'PDLI', 'OMER', 'BTX', 'CORT', 'BSTC', 'SCLN', 'ATRS', 'AMAG', 'CERS', 'BEAT', 'RMTI', 'AMRI', 'FOLD', 'ACAD', 'INSM', 'ANIK', 'RGEN'], dtype=object), array(['GILD', 'BIIB', 'AMGN', 'CELG', 'ILMN', 'UTHR', 'ALKS', 'INCY', 'SGEN', 'VRTX', 'REGN', 'ALXN', 'BMRN'], dtype=object), array(['PRGO', 'AGN', 'MYL', 'ENDP', 'VRX', 'DEPO', 'SCMP', 'LCI', 'AKRX', 'IPXL'], dtype=object), array(['STAA', 'OSUR', 'QDEL', 'CSII', 'ELGX', 'NXTM', 'ABMD', 'SPNC', 'DXCM', 'PODD', 'IVC', 'OFIX', 'ANGO', 'BABY', 'MMSI', 'NUVA', 'WMGI', 'ABAX', 'LMNX', 'HAE', 'ICUI', 'IART', 'MASI', 'CNMD', 'TFX', 'STE', 'WST', 'ENSG', 'USPH', 'ALOG', 'LDR', 'VIVO', 'BCPC', 'NEOG', 'OMCL', 'CRVL', 'CRY', 'CBM', 
'EXAC', 'SRDX'], dtype=object), array(['AMED', 'AFAM', 'LHCG', 'HLS', 'CYH', 'LPNT', 'UHS', 'PMC', 'BKD', 'CSU', 'KND', 'SEM'], dtype=object), array(['HUM', 'CI', 'UNH', 'AET', 'ANTM', 'WCG', 'CNC', 'MOH', 'GTS', 'MGLN'], dtype=object), array(['BMY', 'MRK', 'PFE', 'LLY', 'ABT', 'JNJ'], dtype=object), array(['ABC', 'CAH', 'MCK', 'ESRX', 'DVA', 'DGX', 'LH', 'OMI', 'XRAY', 'HSIC', 'PDCO', 'WOOF', 'CHE', 'MD', 'BAX', 'MDT', 'SYK', 'ZBH', 'BCR', 'BDX', 'ISRG', 'BSX', 'HOLX', 'HRC', 'VAR', 'EW', 'RMD', 'BRKR', 'MTD', 'TMO', 'WAT', 'A', 'PKI', 'ALGN', 'COO', 'CRL', 'PRXL', 'QGEN', 'IDXX', 'BIO', 'TECH'], dtype=object), array(['SDRL', 'ATW', 'DO', 'RIG', 'PKD', 'BAS', 'KEG', 'RES', 'PES', 'UNT', 'HP', 'PTEN', 'SLB', 'BHI', 'HAL', 'FTI', 'NOV', 'DRQ', 'OII', 'OIS', 'SPN', 'CRR', 'BRS', 'CKH', 'HOS', 'TDW', 'MTRX', 'MDR', 'TESO', 'NR', 'HLX', 'TTI', 'WG', 'GEOS', 'IO'], dtype=object), array(['NBL', 'APC', 'APA', 'DVN', 'OXY', 'COP', 'MRO', 'HES', 'MUR', 'PDCE', 'XEC', 'CXO', 'EOG', 'PXD', 'EGN', 'NFX', 'BP', 'CVX', 'XOM', 'INT', 'OKE', 'WMB', 'COG', 'EQT', 'RRC', 'CHK', 'GPOR', 'EGY', 'HK', 'GST', 'CLNE', 'REN', 'REXX', 'XCO', 'PQ', 'NOG', 'CRK', 'SGY', 'WTI', 'BBG', 'DNR', 'SM', 'CRZO', 'CLR', 'WLL', 'AXAS', 'CPE', 'AREX', 'MCF'], dtype=object), array(['AWH', 'VR', 'ACGL', 'RE', 'RNR', 'AHL', 'AXS', 'WTM', 'AON', 'MMC', 'AJG', 'BRO', 'AFSI', 'XL', 'PGR', 'ALL', 'CB', 'CINF', 'MCY', 'ORI', 'PRA', 'THG', 'AFG', 'WRB', 'MKL', 'Y', 'ANAT', 'ERIE'], dtype=object), array(['SNV', 'MTB', 'PNC', 'BBT', 'USB', 'WFC', 'FHN', 'CMA', 'ZION', 'RF', 'HBAN', 'STI', 'FITB', 'KEY', 'COF', 'AXP', 'DFS', 'STT', 'BK', 'NTRS', 'GS', 'MS', 'C', 'BAC', 'JPM'], dtype=object), array(['BLK', 'EV', 'BEN', 'TROW', 'AMG', 'AMP', 'IVZ', 'LM', 'WDR', 'FII', 'SEIC', 'CNS', 'VRTS', 'LAZ', 'EVR', 'GHL', 'ITG', 'IBKR', 'PJC', 'RJF', 'SF', 'ETFC', 'AMTD', 'SCHW', 'NDAQ', 'CME', 'ICE', 'BRK_A', 'BRK_B', 'AFL', 'HIG', 'LNC', 'MET', 'PRU', 'PFG', 'TMK', 'UNM', 'AIZ', 'RGA', 'AIG', 'LUK', 'CNA', 'L'], 
dtype=object), array(['CPF', 'HAFC', 'BANR', 'UCBI', 'FBP', 'BPOP', 'OFG', 'TBBK', 'NFBK', 'ORIT', 'FRME', 'PFS', 'INDB', 'FFBC', 'UBSI', 'FFIN', 'FNB', 'CBU', 'NBTB', 'IBOC', 'TRMK', 'WSBC', 'CHCO', 'STBA', 'CHFC', 'PRK', 'FFIC', 'BRKL', 'DCOM', 'BHLB', 'ISBC', 'OZRK', 'HOMB', 'PNFP', 'WAL', 'BPFH', 'PACW', 'COLB', 'UMBF', 'ONB', 'UMPQ', 'CVBF', 'GBCI', 'WABC', 'WTFC', 'BXS', 'FCF', 'FMBI', 'MBFI', 'TCBI', 'IBKC', 'HBHC', 'BOKF', 'CFR', 'PB', 'SBNY', 'VLY', 'FULT', 'BOH', 'CBSH', 'SIVB', 'CATY', 'EWBC', 'WBS', 'AF', 'NYCB', 'PBCT', 'NWBI', 'WAFD', 'CFFN', 'TFSL'], dtype=object), array(['OB', 'ESGR', 'GLRE', 'MHLD', 'UFCS', 'IPCC', 'SAFT', 'AGII', 'NAVG', 'RLI', 'SIGI', 'AMSF', 'EIG', 'BOFI', 'UVE', 'BLX', 'OPY', 'FCNCA', 'HTH', 'HMN', 'KMPR', 'FFG', 'AEL', 'CNO', 'PZN', 'MKTX', 'BGCP', 'SFE'], dtype=object), array(['NICK', 'BANC', 'FBC', 'PSTB', 'SBCF', 'HTBK', 'SNBC', 'CNOB', 'FDEF', 'CHFN', 'CCF', 'EBTC', 'IHC'], dtype=object), array(['BMTC', 'FISI', 'WSFS', 'PCBK', 'EFSC', 'HFWA', 'ABCB', 'WTBA', 'BUSE', 'TOWN', 'SBSI', 'FBNC', 'AROW', 'GABC', 'GSBC', 'HTLF', 'UVSP', 'FCBC', 'LBAI', 'TCBK', 'STL', 'UBSH', 'TMP', 'BANF', 'SRCE', 'THFF', 'TRST', 'LKFN', 'SASR', 'SFNC', 'RNST', 'WASH', 'CTBI', 'SYBT', 'CAC', 'MSFG', 'BKMU', 'PEBO', 'UBNK', 'CSFL', 'EGBN', 'COBZ', 'FLIC', 'BSRR', 'OCFC', 'OKSB', 'RBCAA', 'ESSA', 'CSBK', 'EBSB', 'BNCL', 'KRNY', 'SENEA', 'NUTR', 'ODC', 'DGICA', 'HALL', 'EMCI', 'GBLI', 'STFC', 'CTO', 'CIA', 'NHC', 'CASS', 'WEYS', 'TBNK', 'BFIN', 'CZNC', 'BDGE', 'CCNE', 'FNLC', 'ATLO', 'BWINB', 'CCBG', 'PWOD', 'CNBKA', 'BMRC', 'NKSH'], dtype=object), array(['HTGC', 'MAIN', 'TCAP', 'TICC', 'PNNT', 'PSEC', 'AINV', 'ARCC', 'BKCC', 'FSC', 'CSWC', 'MVC', 'GAIN', 'GLAD'], dtype=object), array(['JAKK', 'WWE', 'FLWS', 'AVHI', 'TTGT', 'ENZ', 'OME', 'DMRC', 'SNMX', 'ARAY', 'ARQL', 'LWAY', 'ISRL', 'ATRI', 'NVEC', 'OFLX', 'USLM', 'HBIO', 'RTIX', 'CWCO', 'KMG', 'PRSC', 'CFI', 'HCKT', 'PLUS', 'WINA', 'NATR', 'JOUT', 'ALCO', 'LMNR', 'HNRG', 'AIQ', 'RDNT', 'HNH', 
'PLPC', 'SGA', 'BDE', 'NMRX', 'SHLO', 'BREW', 'SAM', 'MPAA', 'ATRC', 'HSTM'], dtype=object), array(['CFNB', 'GRIF', 'GSIT', 'RELL', 'LINC', 'IVAC', 'QTM', 'MOSY', 'RBCN', 'CLUB', 'ATEC', 'AVNW', 'CVO', 'AHC', 'MNI', 'PRGX', 'RST', 'DWSN', 'MIND', 'AOI', 'FVE', 'MSL', 'RTK', 'JMBA', 'PCO', 'FHCO', 'PTIE', 'CTG', 'GSOL', 'HIL', 'SEAC', 'LUB', 'RSYS', 'CTIC', 'PPHM', 'BIOS', 'CPST', 'FCEL', 'XRM', 'UEC', 'URG', 'CYTX', 'VICL', 'HDNG', 'GMO', 'SPRT'], dtype=object), array(['ARTNA', 'YORW', 'CTWS', 'MSEX', 'SJW', 'AWR', 'CWT', 'IDT', 'CBB', 'CNSL', 'GNCMA', 'ATNI', 'SHEN', 'CALM', 'CORE', 'CVGW', 'BGS', 'THS', 'WDFC', 'LNCE', 'JJSF', 'LANC', 'TR', 'JBSS', 'FARM', 'COKE', 'FIZZ'], dtype=object), array(['LRN', 'BPI', 'CECO', 'CPLA', 'STRA', 'APEI', 'LOPE', 'GHC', 'UTI'], dtype=object), array(['HAIN', 'VRSK', 'ABCO', 'INWK', 'MED', 'NTRI', 'CHDN', 'ISCA', 'TRK', 'MTN', 'HELE', 'HSNI', 'NCMI', 'RGS', 'CENTA', 'PBH', 'UHAL', 'VRTU', 'EXLS', 'MMS', 'SYNT', 'CSGS', 'CVG', 'SYKE', 'TTEC', 'PRFT', 'LYV', 'CSGP', 'JCOM', 'LDOS', 'CACI', 'MANT', 'HCSG', 'CTAS', 'UNF', 'ABM', 'ROL', 'ACXM', 'HI', 'BCO', 'SCI', 'SSP', 'MDP', 'NYT', 'RRD', 'JW_A', 'SCHL', 'RECN', 'HSII', 'KFY', 'ASGN', 'DLX', 'KFRC', 'MAN', 'RHI', 'KELYA', 'TBI', 'FCN', 'HURN', 'TILE', 'KNL', 'SCS', 'HNI', 'MLHR', 'VOXX', 'NSR', 'OSIS', 'IRBT', 'UEIC'], dtype=object), array(['ERII', 'STRL', 'GLDD', 'ORN', 'LTS', 'COWN', 'ATRO', 'INTL', 'AGM', 'BBSI', 'CWST', 'CRMT', 'MG', 'CMTL', 'RNWK', 'ETM', 'PKOH', 'MYRG', 'GHM', 'TISI', 'CLH', 'ECOL', 'HURC', 'NNBR', 'SEB', 'SP', 'AMSWA', 'TYPE', 'MRLN', 'LBY', 'LCUT', 'CRD_B', 'KVHI', 'AGYS', 'CRAI', 'LABL', 'ICFI', 'FC', 'GPX', 'VICR', 'CDI', 'LYTS', 'HZO', 'LDL', 'ELY', 'SPAR', 'FORR', 'TRC', 'ARC', 'SYX', 'HHS', 'MPX', 'CCRN', 'MLR', 'NWLI', 'NC', 'WMAR', 'MCRI', 'HVT', 'NPK', 'MTSC', 'RAVN', 'DEL', 'FIX', 'BRC', 'MATW', 'EXPO', 'NCI', 'MYE', 'MCS', 'NSP', 'BELFB', 'BBOX', 'DAKT', 'AAON', 'MEI', 'FSS', 'ROG', 'CTS', 'PKE', 'LNDC', 'ACET', 'CSS', 'DHIL', 'IIIN', 'EBF', 'VVI', 
'GBL', 'WHG', 'NEWS', 'PICO', 'UFI', 'MLNK', 'EXTR', 'VDSI', 'DCO', 'VSEC', 'CBZ', 'DGII', 'DSPG', 'ESIO', 'KOPN', 'COHU', 'IXYS', 'MRCY', 'XOXO'], dtype=object), array(['AZZ', 'POWL', 'AIMC', 'WIRE', 'AIN', 'BDC', 'ENS', 'NPO', 'CIR', 'SNHY', 'ASTE', 'EME', 'MLI', 'B', 'WTS', 'AIT', 'KAMN', 'MSA', 'FELE', 'ROLL', 'TTEK', 'BGG', 'ESE', 'CMCO', 'KAI', 'JBT', 'TNC', 'SXI', 'BMI', 'GRC', 'NNI', 'MOD', 'SUP', 'AYR', 'RUSHA', 'TRS', 'MGRC', 'MINI', 'DORM', 'SMP', 'ALG', 'ATSG', 'SRI', 'ACCO', 'MWA', 'AVD', 'FOE', 'SCL', 'SHLM', 'KWR', 'POL', 'FUL', 'MTX', 'SXT', 'OLN', 'KOP', 'TG', 'IPHS', 'HWKN', 'IOSP', 'OMN', 'ANDE', 'CCC', 'DAR', 'LXU'], dtype=object), array(['EPM', 'NGS', 'PHX', 'GIFI', 'HWCC', 'USAP', 'LAYN', 'AP', 'BOOM', 'CVGI', 'TWIN', 'ASCMA', 'NWPX', 'FSTR', 'PHIIK', 'EZPW', 'FCFS', 'WRLD', 'ECPG', 'PRAA', 'DDD', 'SSYS', 'AMSC', 'MCHX', 'ENOC', 'SPWR', 'CVA', 'ORA'], dtype=object), array(['MCD', 'SBUX', 'YUM', 'CMG', 'PNRA', 'TAST', 'BH', 'DENN', 'RUTH', 'DRI', 'EAT', 'CBRL', 'CAKE', 'TXRH', 'BOBE', 'DIN', 'RT', 'BJRI', 'BWLD', 'DPZ', 'PZZA', 'WEN', 'RRGB', 'JACK', 'SONC'], dtype=object), array(['SVU', 'KR', 'WFM', 'VLGEA', 'IMKTA', 'WMK', 'SPTN', 'UNFI'], dtype=object), array(['DEST', 'DXLG', 'BGFV', 'SCVL', 'SMRT', 'SSI', 'PSMT', 'MNRO', 'CASY', 'FRED', 'BKS', 'CTRN', 'KIRK', 'TUES', 'BBW', 'CONN', 'BONT', 'CBK', 'NWY', 'BEBE', 'RGR'], dtype=object), array(['COH', 'VFC', 'PVH', 'RL', 'HBI', 'NKE', 'UAA', 'SKX', 'CROX', 'DECK', 'SHOO', 'WWW', 'MOV', 'PERY', 'ICON', 'GIII', 'COLM', 'OXM'], dtype=object), array(['AAP', 'AZO', 'ORLY', 'TGT', 'COST', 'WMT', 'ROST', 'TJX', 'BIG', 'DLTR'], dtype=object), array(['ODP', 'SPLS', 'BBY', 'GME', 'AAN', 'RCII', 'PIR', 'BBBY', 'WSM', 'SBH', 'TSCO', 'ULTA', 'VSI', 'CAB', 'DKS', 'HIBB', 'FINL', 'FL', 'CRI', 'PLCE', 'GPS', 'LB', 'AEO', 'ANF', 'GES', 'ZUMZ', 'DSW', 'ASNA', 'CHS', 'BKE', 'CATO', 'GCO', 'SHLD', 'JCP', 'KSS', 'DDS', 'JWN', 'M'], dtype=object), array(['TRUE', 'AMD', 'MU', 'MRVL', 'NVDA', 'AMKR', 'KLAC', 'LRCX', 
'AMAT', 'TER', 'CRUS', 'AVGO', 'SWKS', 'MXIM', 'XLNX', 'MCHP', 'ADI', 'TXN', 'CAVM', 'CY', 'DIOD', 'IDTI', 'MSCC', 'SLAB', 'SMTC', 'MPWR', 'POWI', 'PLAB', 'CCMP', 'AEIS', 'BRKS', 'ENTG', 'MKSI', 'UCTT', 'NANO', 'RTEC', 'ACLS', 'PDFS', 'SIGM', 'IMMR', 'SYNA', 'KEM', 'CEVA', 'CREE', 'VECO', 'FORM', 'RMBS', 'LSCC', 'XCRA'], dtype=object), array(['TTWO', 'ATVI', 'EA', 'NFLX', 'BIDU', 'EXPE', 'GOOGL', 'AMZN', 'PCLN', 'AKAM', 'VRSN', 'CHKP', 'DOX', 'CVLT', 'VMW', 'CRM', 'CTXS', 'RHT', 'CDNS', 'SNPS', 'ADSK', 'ANSS', 'ADBE', 'INTU', 'AZPN', 'NUAN', 'STX', 'WDC', 'HPQ', 'XRX', 'CTSH', 'ACN', 'IBM', 'CSCO', 'INTC', 'SYMC', 'MSFT', 'CA', 'ORCL', 'AAPL', 'MSI', 'EBAY', 'QCOM'], dtype=object), array(['OCLR', 'INFN', 'CIEN', 'FNSR', 'BRCD', 'NTAP', 'FFIV', 'JNPR', 'OLED', 'IPGP', 'COHR', 'IIVI', 'FARO', 'CGNX', 'LFUS', 'TTMI', 'JBL', 'SANM', 'BHE', 'PLXS', 'NSIT', 'SCSC', 'SNX', 'TECD', 'ARW', 'AVT', 'AVX', 'VSH', 'GRMN', 'ITRI', 'NATI', 'TRMB', 'FLIR', 'HRS', 'GLW', 'APH', 'TEL', 'SONS', 'SNCR', 'HLIT', 'NTCT', 'DLB', 'IDCC', 'ADTN', 'ARRS', 'VSAT', 'NTGR', 'PLT', 'DHX', 'MXWL', 'CRAY', 'SMCI', 'DBD', 'NCR', 'PAY', 'UIS', 'ZBRA', 'BID', 'EFII', 'WEX', 'DGI', 'LORL', 'SATS'], dtype=object), array(['CPSI', 'QSII', 'ATHN', 'CERN', 'MDRX', 'IRDM', 'ORBC', 'CCOI', 'LVLT', 'EGHT', 'VG', 'CLFD', 'QUIK', 'CAMP', 'WBMD', 'VRNT', 'PEGA', 'MSTR', 'MANH', 'TYL', 'ULTI', 'EGOV', 'EPAY', 'PRGS', 'ACIW', 'BLKB', 'CALD', 'GUID', 'MDSO', 'LOGM', 'PRO', 'STMP', 'OSTK', 'SHOR', 'EHTH', 'PETS', 'LQDT', 'CATM', 'BCOR', 'LPSN', 'EBIX', 'MGI', 'INAP', 'LLNW', 'CACC', 'ACTG', 'HMSY', 'WTW', 'SIRI', 'CETV', 'KTOS', 'VHC', 'SFLY', 'ZIXI', 'TIVO', 'TZOO'], dtype=object), array(['UNP', 'CSX', 'NSC', 'GWR', 'KSU', 'MSM', 'FAST', 'GWW', 'JCI', 'GE', 'DHR', 'HON', 'UTX', 'ITW', 'MMM', 'PCAR', 'CAT', 'CMI', 'IR', 'ETN', 'PH', 'EMR', 'ROK', 'WAB', 'AME', 'ROP', 'CFX', 'PNR', 'ITT', 'CR', 'DOV', 'FLS', 'RBC', 'ATU', 'KMT', 'TKR', 'NDSN', 'GGG', 'IEX', 'DCI', 'LECO'], dtype=object), array(['ACM', 'CBI', 'KBR', 
'FLR', 'JEC', 'DY', 'MTZ', 'PWR', 'AGX', 'PRIM', 'NCS', 'FOR', 'BECN', 'AEGN', 'GVA', 'TPC', 'TITN', 'AGCO', 'DE', 'LNN', 'VMI', 'RAIL', 'ARII', 'GBX', 'TRN', 'CAR', 'HTZ', 'OSK', 'MTW', 'TEX', 'WNC', 'NAV', 'MTOR', 'WBC', 'TGH', 'HEES', 'URI', 'GTLS', 'TWI', 'DXPE', 'AXE', 'WCC', 'BGC', 'HSC'], dtype=object), array(['GPK', 'ATR', 'BMS', 'SON', 'AVY', 'SEE', 'GEF', 'SLGN', 'BLL', 'CCK', 'UFS', 'KS', 'IP', 'PKG', 'SWM', 'CLW', 'GLT', 'NP'], dtype=object), array(['IPI', 'CF', 'MOS', 'DD', 'DOW', 'HUN', 'CBT', 'FMC', 'CE', 'EMN', 'WLK', 'APD', 'PX', 'IFF', 'ECL', 'PPG', 'RPM', 'NEU', 'ASH', 'ALB', 'GRA', 'CMP', 'MON'], dtype=object), array(['CLF', 'AKS', 'X', 'ZEUS', 'SCHN', 'CMC', 'RS', 'NUE', 'STLD', 'CRS', 'WOR', 'FCX', 'SCCO', 'ARNC', 'CENX', 'GSM', 'KALU', 'HAYN', 'MTRN'], dtype=object), array(['HFC', 'TSO', 'VLO', 'CVI', 'ALJ', 'DK'], dtype=object), array(['NNA', 'DHT', 'FRO', 'NAT', 'TNK', 'PEIX', 'GPRE', 'REX', 'NM', 'SB', 'WLB', 'GLNG', 'SFL', 'TK', 'FTK', 'LNG', 'MNTX', 'KRO', 'NL'], dtype=object), array(['AAL', 'DAL', 'UAL', 'JBLU', 'ALK', 'LUV', 'ALGT', 'HA', 'SKYW'], dtype=object), array(['HTLD', 'KNX', 'WERN', 'ODFL', 'JBHT', 'LSTR', 'ARCB', 'SAIA', 'CGI', 'MRTN', 'KEX', 'MATX', 'FDX', 'UPS', 'CHRW', 'EXPD', 'AAWW', 'FWRD', 'HUBG', 'ECHO', 'XPO', 'USAK', 'YRCW'], dtype=object), array(['LMT', 'NOC', 'RTN', 'GD', 'LLL', 'SPR', 'BA', 'COL', 'HXL', 'TXT', 'TDG', 'TGI', 'AVAV', 'CUB', 'ESL', 'MOG_A', 'CW', 'TDY', 'AIR', 'HEI'], dtype=object), array(['CMLS', 'EVC', 'GTN', 'NXST', 'SBGI', 'CNK', 'RGC', 'SNI', 'DISCA', 'DISCK', 'DIS', 'FOX', 'FOXA', 'TWX', 'CBS', 'VIAB', 'DISH', 'CMCSA', 'LBTYA'], dtype=object), array(['SBAC', 'AMT', 'CCI', 'VOD', 'CTL', 'T', 'VZ', 'TDS', 'USM', 'S', 'TMUS'], dtype=object), array(['HRG', 'SPB', 'HLF', 'NUS', 'USNA', 'IPAR', 'REV', 'ADM', 'BG', 'FDP', 'INGR', 'CCL', 'RCL', 'CHH', 'MAR', 'WYN', 'IRM', 'LAMR', 'IPG', 'OMC', 'EL', 'TRI', 'SMG', 'NWL', 'TUP', 'CCO', 'CPA', 'STC', 'CLGX', 'FNF', 'ALR', 'HRB', 'RAD', 'CVS', 'WBA', 
'SRCL', 'WCN', 'RSG', 'WM', 'MA', 'V', 'MORN', 'MCO', 'FDS', 'MSCI', 'EFX', 'FICO', 'IT', 'BR', 'DST', 'EEFT', 'MIDD', 'G', 'ADP', 'PAYX', 'FIS', 'FISV', 'JKHY', 'ADS', 'WU', 'GPN', 'TSS'], dtype=object), array(['NVR', 'MHO', 'PHM', 'TOL', 'DHI', 'LEN', 'KBH', 'MDC', 'MTH', 'BZH', 'HOV'], dtype=object), array(['KMX', 'AN', 'LAD', 'ABG', 'SAH', 'GPI', 'PAG', 'CTB', 'GT', 'THRM', 'F', 'GNTX', 'AXL', 'BWA', 'DAN', 'TEN'], dtype=object), array(['TREX', 'AMWD', 'BLDR', 'CVCO', 'APOG', 'ROCK', 'NX', 'GFF', 'SSD', 'UFPI', 'EXP', 'MLM', 'VMC', 'LPX', 'OC', 'USG', 'AWI', 'MAS', 'MHK', 'SHW', 'HD', 'LOW', 'CPRT', 'POOL', 'GPC', 'LKQ', 'WSO', 'AOS', 'LII', 'AYI', 'TTC', 'CSL', 'SNA', 'SWK', 'SCSS', 'TPX', 'ETH', 'LZB', 'LEG', 'WHR', 'HAS', 'MAT', 'HOG', 'BC', 'PII', 'SIG', 'TIF', 'LL', 'THO', 'WGO'], dtype=object), array(['AHT', 'FCH', 'RHP', 'HT', 'HPT', 'SHO', 'HST', 'DRH', 'LHO', 'CXW', 'GEO', 'WY', 'PCH', 'RYN', 'JOE', 'CBG', 'JLL', 'HF', 'KW'], dtype=object), array(['MGM', 'LVS', 'WYNN', 'PENN', 'BYD', 'PNK', 'IGT', 'SGMS'], dtype=object), array(['MTG', 'RDN', 'AGO', 'MBI', 'ASPS', 'OCN', 'PHH', 'WAC'], dtype=object)]

This can be quite puzzling without knowing the stocks, so we are going to study these clusters a bit further. Below, we sort the rows and columns of the correlation matrix using the dendrogram computed by the Ward method. Blocks appear on the diagonal; these blocks correspond to the clusters. Some clusters are strongly correlated internally, others less so. Some are far from (uncorrelated with) the rest of the stocks, while others are mildly correlated with nearly every other stock.

# Show the correlation matrix for the same settings as the clustering above
# (Ward method, Spearman correlation, nb_clusters clusters).
display_filtered_correlation(
    X,
    nb_clusters=nb_clusters,
    method="ward",
    type_correl="spearman",
)

Now we try to name these clusters automatically, based on descriptive features of each stock. Here we use only industry-related information and the credit rating, but one could be more imaginative and add many other factors.

# Import the features describing the stocks. The 'Unnamed: 0' column is used
# as the row index (presumably it holds the tickers -- confirm against the
# CSV).
features = pd.read_csv('stock_features.csv')
features.index = features['Unnamed: 0']

# Drop the index label. Assigning None is accepted by every pandas version,
# whereas `del features.index.name` fails on recent releases.
features.index.name = None

# Keep the industry classification and rating columns, transposed so that
# each stock becomes a column.
main_features = features[
    [
        'INDUSTRY_SECTOR',
        'INDUSTRY_GROUP',
        'INDUSTRY_SUBGROUP',
        'RTG_SP_LT_LC_ISSUER_CREDIT',
    ]
].T

# Select and order the stock columns to match the return matrix X.
main_features = main_features[X.columns]
main_features.head()

A AAL AAN AAON AAP AAPL AAWW ABAX ABC ABCB ... YRCW YUM ZAGG ZBH ZBRA ZEUS ZION ZIOP ZIXI ZUMZ INDUSTRY_SECTOR Industrial Consumer, Cyclical Consumer, Non-cyclical Industrial Consumer, Cyclical Technology Industrial Consumer, Non-cyclical Consumer, Non-cyclical Financial ... Industrial Consumer, Cyclical Industrial Consumer, Non-cyclical Industrial Industrial Financial Consumer, Non-cyclical Communications Consumer, Cyclical INDUSTRY_GROUP Electronics Airlines Commercial Services Building Materials Retail Computers Transportation Healthcare-Products Pharmaceuticals Banks ... Transportation Retail Electronics Healthcare-Products Machinery-Diversified Metal Fabricate/Hardware Banks Biotechnology Internet Retail INDUSTRY_SUBGROUP Electronic Measur Instr Airlines Rental Auto/Equipment Bldg Prod-Air&Heating Retail-Auto Parts Computers Transport-Air Freight Medical Instruments Medical-Whsle Drug Dist Commer Banks-Southern US ... Transport-Truck Retail-Restaurants Electronic Compo-Misc Medical Products Machinery-General Indust Metal Products-Distrib Commer Banks-Western US Medical-Biomedical/Gene Internet Security Retail-Sporting Goods RTG_SP_LT_LC_ISSUER_CREDIT BBB+ BB- NaN NaN BBB- AA+ NaN NaN A- NaN ... B- BB NaN BBB BB- NaN BBB- NaN NaN NaN 4 rows × 2067 columns

We compute a name for each cluster:

# Recompute the clusters together with one name per cluster, built from the
# descriptive features in main_features.
clusters, cluster_names = build_named_clustering(
    X,
    main_features,
    nb_clusters,
)

We can see below that some clusters correspond very precisely to a given (sub)industry:

# Print one summary per cluster: its name, its size, the mean of its
# Spearman correlation matrix, and then its member tickers.
for cluster, cluster_name in zip(clusters, cluster_names):
    # Mean over every entry of the within-cluster correlation matrix
    # (diagonal included).
    mean_correl = X[cluster].corr(method="spearman").values.mean()
    # "%.2f" is the valid two-decimal float format; a space between the
    # precision and the conversion type makes the % operator raise.
    print(cluster_name
          + " \t\t | size: " + str(len(cluster))
          + " \t\t | mean correl: " + "%.2f" % mean_correl)
    # Four newlines separate consecutive clusters in the output.
    print(cluster, end="\n\n\n\n")