



> {-# LANGUAGE MultiParamTypeClasses,FlexibleInstances,FunctionalDependencies,GeneralizedNewtypeDeriving #-}



> module Main where



> import Data.HashTable

> import Data.List as L

> import Data.Int

> import Data.Array

> import GHC.Arr

> import qualified Data.Map as M

> import Control.Monad

> infixl 5 .+

> infixl 6 .*









> data I = A | B | C | D deriving (Eq,Ord,Show,Ix,Enum)









> c' (a,b,c,d) = hashString ("C" ++ show (a,b,c,d))





Memoisable

memo

c

c'

symmetrise

c (i,j,k,l) == c (j,i,l,k)

a4





> a4 (i,j,k,l) = [

> (i,j,k,l),

> (j,i,l,k),

> (k,l,i,j),

> (l,k,j,i),



> (i,k,l,j),

> (i,l,j,k),



> (k,j,l,i),

> (l,j,i,k),



> (j,l,k,i),

> (l,i,k,j),



> (j,k,i,l),

> (k,i,j,l)

> ]





symmetrise





> symmetrise group f x = sum (map f (group x))









> h = memo $ \a -> hashString ("H" ++ show a) .* return ()









> s2 (i,j) = [ (i,j), (j,i) ]

> o' (a,b) = hashString ("O" ++ show (a,b))

> o = memo $ \x -> symmetrise s2 o' x .* return ()





ijkl

i

ijkl

i

j

k

l





> bond :: V Int32 I

> bond = return A .+ return B .+ return C .+ return D





2





> h2 = simp $ do

> i <- bond

> h ! i

> h ! i





i <- bond

i

h2

4





> h_ :: V Int32 I

> h_ = do

> m <- bond

> h ! m

> return m





ch2_





> ch2_ = memo $ \i -> simp $ do

> k <- bond

> l <- bond

> m <- bond

> h ! l

> h ! m

> c ! (i,k,l,m)

> return k









> alkyl_ 0 = h_



> alkyl_ n = simp $ do

> i <- alkyl_ (n-1)

> ch2_ ! i









> alkane n = simp $ alkyl_ n >>= (h ! )

> methane = alkane 1









> oh = memo $ \i -> simp $ do

> j <- bond

> h ! j

> o ! (i,j)









> ethanol = simp $ do

> i <- alkyl_ 2

> oh ! i









> doubleBond :: V Int32 (I,I)

> doubleBond = simp $ do

> i <- bond

> j <- bond

> return (i,j)





symmetrise





> c_c = memo $ \(i,j,k,l) -> simp $ do

> (m,n) <- doubleBond

> c ! (i,j,m,n)

> c ! (m,n,k,l)









> ethene = simp $ do

> i <- h_

> j <- h_

> k <- h_

> l <- h_

> c_c ! (i,j,k,l)









> ch3_ = simp $ do

> l <- h_

> ch2_ ! l









> propene = simp $ do

> j <- ch3_

> i <- h_

> k <- h_

> l <- h_

> c_c ! (i,j,k,l)



> cisbut2ene = simp $ do

> i <- ch3_

> j <- h_

> k <- ch3_

> l <- h_

> c_c ! (i,j,k,l)



> transbut2ene = simp $ do

> i <- h_

> j <- ch3_

> k <- ch3_

> l <- h_

> c_c ! (i,j,k,l)



> cisbut2ene' = simp $ do

> i <- h_

> j <- ch3_

> k <- h_

> l <- ch3_

> c_c ! (i,j,k,l)



> _2methylpropene = simp $ do

> i <- ch3_

> j <- ch3_

> k <- h_

> l <- h_

> c_c ! (i,j,k,l)



> _2methylpropene' = simp $ do

> i <- h_

> j <- h_

> k <- ch3_

> l <- ch3_

> c_c ! (i,j,k,l)









> ring1 (p,q,r,s,t,u) = simp $ do

> i <- bond

> j <- bond

> k <- bond

> c_c ! (j,q,i,p)

> c_c ! (i,u,k,t)

> c_c ! (k,s,j,r)









> ring = memo $ \(p,q,r,s,t,u) -> simp $ ring1 (p,q,r,s,t,u) .+ ring1 (q,r,s,t,u,p)









> phenyl = memo $ \p -> simp $ do

> q <- h_

> r <- h_

> s <- h_

> t <- h_

> u <- h_

> ring ! (p,q,r,s,t,u)



> benzene :: V Int32 ()

> benzene = simp $ do

> i <- bond

> phenyl ! i

> h ! i



> phenol :: V Int32 ()

> phenol = simp $ do

> i <- bond

> phenyl ! i

> oh ! i



> toluene :: V Int32 ()

> toluene = simp $ do

> i <- ch3_

> phenyl ! i



> toluene' :: V Int32 ()

> toluene' = simp $ do

> p <- h_

> q <- h_

> r <- h_

> s <- ch3_

> t <- h_

> u <- h_

> ring ! (p,q,r,s,t,u)



> toluene'' :: V Int32 ()

> toluene'' = simp $ do

> p <- h_

> q <- h_

> r <- ch3_

> s <- h_

> t <- h_

> u <- h_

> ring ! (p,q,r,s,t,u)



> _1_2_dimethylbenzene = simp $ do

> p <- ch3_

> q <- ch3_

> r <- h_

> s <- h_

> t <- h_

> u <- h_

> ring ! (p,q,r,s,t,u)



> _1_3_dimethylbenzene = simp $ do

> p <- ch3_

> q <- h_

> r <- ch3_

> s <- h_

> t <- h_

> u <- h_

> ring ! (p,q,r,s,t,u)



> _1_4_dimethylbenzene = simp $ do

> p <- ch3_

> q <- h_

> r <- h_

> s <- ch3_

> t <- h_

> u <- h_

> ring ! (p,q,r,s,t,u)



> _1_5_dimethylbenzene = simp $ do

> p <- ch3_

> q <- h_

> r <- h_

> s <- h_

> t <- ch3_

> u <- h_

> ring ! (p,q,r,s,t,u)









> main = do

> print $ "toluene = " ++ show toluene

> print $ "toluene' = " ++ show toluene'

> print $ "toluene'' = " ++ show toluene''

> print $ "_1_2_dimethylbenzene = " ++ show _1_2_dimethylbenzene

> print $ "_1_3_dimethylbenzene = " ++ show _1_3_dimethylbenzene

> print $ "_1_4_dimethylbenzene = " ++ show _1_4_dimethylbenzene

> print $ "_1_5_dimethylbenzene = " ++ show _1_5_dimethylbenzene









> class Memoisable ix where

> memo :: (ix -> a) -> Array ix a



> instance Memoisable I where

> memo f = array (A,D) [(i,f i) | i <- [A .. D]]



> instance Memoisable (I,I) where

> memo f = array ((A,A),(D,D)) [(i,f i) |

> p <- [A .. D],

> q <- [A .. D],

> let i = (p,q) ]



> instance Memoisable (I,I,I,I) where

> memo f = array ((A,A,A,A),(D,D,D,D)) [(i,f i) |

> p <- [A .. D],

> q <- [A .. D],

> r <- [A .. D],

> s <- [A .. D],

> let i = (p,q,r,s) ]



> instance Memoisable (I,I,I,I,I,I) where

> memo f = array ((A,A,A,A,A,A),(D,D,D,D,D,D)) [(i,f i) |

> p <- [A .. D],

> q <- [A .. D],

> r <- [A .. D],

> s <- [A .. D],

> t <- [A .. D],

> u <- [A .. D],

> let i = (p,q,r,s,t,u) ]





Data.Array





> instance (Ix a1, Ix a2, Ix a3, Ix a4, Ix a5, Ix a6) => Ix (a1,a2,a3,a4,a5,a6) where

> range ((l1,l2,l3,l4,l5,l6),(u1,u2,u3,u4,u5,u6)) =

> [(i1,i2,i3,i4,i5,i6) | i1 <- range (l1,u1),

> i2 <- range (l2,u2),

> i3 <- range (l3,u3),

> i4 <- range (l4,u4),

> i5 <- range (l5,u5),

> i6 <- range (l6,u6)]



> unsafeIndex ((l1,l2,l3,l4,l5,l6),(u1,u2,u3,u4,u5,u6)) (i1,i2,i3,i4,i5,i6) =

> unsafeIndex (l6,u6) i6 + unsafeRangeSize (l6,u6) * (

> unsafeIndex (l5,u5) i5 + unsafeRangeSize (l5,u5) * (

> unsafeIndex (l4,u4) i4 + unsafeRangeSize (l4,u4) * (

> unsafeIndex (l3,u3) i3 + unsafeRangeSize (l3,u3) * (

> unsafeIndex (l2,u2) i2 + unsafeRangeSize (l2,u2) * (

> unsafeIndex (l1,u1) i1)))))



> inRange ((l1,l2,l3,l4,l5,l6),(u1,u2,u3,u4,u5,u6)) (i1,i2,i3,i4,i5,i6) =

> inRange (l1,u1) i1 && inRange (l2,u2) i2 &&

> inRange (l3,u3) i3 && inRange (l4,u4) i4 &&

> inRange (l5,u5) i5 && inRange (l6,u6) i6





Int32





> swap (x,y) = (y,x)



> class Num k => VectorSpace k v | v -> k where

> zero :: v

> (.+) :: v -> v -> v

> (.*) :: k -> v -> v

> (.-) :: v -> v -> v

> v1 .- v2 = v1 .+ ((-1).*v2)



> data V k a = V { unV :: [(k,a)] } deriving (Show)



> reduce x = filter ((/=0) . fst) $ fmap swap $ M.toList $ M.fromListWith (+) $ fmap swap $ x

> simp (V x) = V (reduce x)



> instance (Ord a,Num k) => Eq (V k a) where

> V x==V y = reduce x==reduce y



> instance (Ord a,Num k,Ord k) => Ord (V k a) where

> compare (V x) (V y) = compare (reduce x) (reduce y)



> instance Num k => Functor (V k) where

> fmap f (V as) = V $ map (\(k,a) -> (k,f a)) as



> instance Num k => Monad (V k) where

> return a = V [(1,a)]

> x >>= f = join (fmap f x)

> where join x = V $ concat $ fmap (uncurry scale) $ unV $ fmap unV x

> scale k1 as = map (\(k2,a) -> (k1*k2,a)) as



> instance Num r => MonadPlus (V r) where

> mzero = V []

> mplus (V x) (V y) = V (x++y)



> instance (Num k,Ord a) => VectorSpace k (V k a) where

> zero = V []

> V x .+ V y = V (x ++ y)

> (.*) k = (>>= (\a -> V [(k,a)]))



> e = return :: Num k => a -> V k a

> coefficient b (V bs) = maybe 0 id (L.lookup b (map swap (reduce bs)))





Twenty or so years ago I worked for a pharmaceutical company that had a large database of compounds. That got me thinking about the problem of how to perform lookups based on molecular structures. If you can find a bunch of numbers that encapsulate the molecular structure then you can use them as database keys. But you need to ensure that the same molecule entered in two different ways gets mapped to the same numbers, and you'd like different molecules, such as stereoisomers , or even enantiomers to get mapped to different values.That got me thinking and around then I came up with an idea for hashing molecules inspired by some of the mathematics I'd been doing not long before. I never got around to coding it up but twenty years later it dawned on me that I could easily do it using a similar technique to what I used for untangling tangles and translating trace diagrams . Anyway, as I've already given examples of how to translate diagrams to monadic expressions I'm going to skimp on the details in this post and just talk about things specific to molecular structures. For this post to make sense you have to understand those earlier posts.So first comes the usual Haskell preamble:I'm going to be using the vector space monad with a 4-dimensional vector space. This type labels the dimensions:n-valent atoms will be represented by functions that consume n-tuples. We'll start with a simple hash:My prime motivation for using Haskell for this problem is that the code was super-easy to write and investigate. But it's inefficient. I'll talk about how to remedy this properly at the end. But for now I'm going to memoise many functions as arrays using atype class with amethod. So I'll be usinginstead of> c = memo $ \x -> symmetrise a4 c' x .* return ()Note the use of thefunction. The idea is that the 4 bonds coming out from (singly-bonded) carbon can be thought of as lying at the corners of a tetrahedron.They have tetrahedral symmetry . So I'd like my hash to also have this symmetry so that, for example,. We can enforce this by summing over all 24 permutations of the arguments compatible with this symmetry, also known as A4. Solists all of these permutations:Andperforms the summation:We can define other molecules too. Hydrogen is easy:Oxygen has S2 symmetry:You can think of a carbon atom and a hydrogen atom, say, as a pair of arrays cand h, and bonding them together as summation over a shared index. So methane would be the sum over i,j,k,l = A..D of c. To this end, define a bond as:We can make Hlike so:makesa bond which we then attach to two hydrogen atoms. Evaluatingwill give us the hash for hydrogen gas.Rather than dive straight into CHwe can construct some useful building blocks. Hydrogen with a bond already attached:I'm using a trailing underscore _ to indicate a free bond.accepts one bond and returns another. Once memoised it is, in effect, just a 4x4 matrix and can be used to rapidly build chains.So here's code to make an alkyl chain with a free bind at the end.We can now make alkanes by attaching a hydrogen atom at the end and make methane as a special case:Now a hydroxyl group And you can have a drink on me:Carbon double bonds turn out to be straightforward. We can simply use a pair of bonds:Carbon-carbon double bonds tend not to twist and this is reflected in the hash. We'd have to applyif we wanted twistable bonds.We can make a pre-canned doubly bonded carbon atom pair:So we can build ethene like thisHere's a methyl group with a free bondSo we can build a bunch more compoundsAn interesting problem is building a benzene ring. Here's a first attempt with six free bonds:The problem with that is that benzene rings are special. The electrons are 'delocalised' so that the ring has rotational symmetry. We need to sum over the two consistent ways to assign single and double bonds around the ring. For the more general case of interlocking benzene rings we must still sum over all consistent assignments.And some more compunds:As we might hope, the three different ways to define toluene give the same result. We also discover that 1,3- and 1,5-dimethylbenzene are the same compound (or at least probably are, this isn't a perfect hash ).Now I need to say something about performance. The above code is naive and performs many unnecessary summations. For example, hashing a long chain should only take time linear in its length. But using the above code indiscriminately could give you exponential time. A good implementation might take a divide and conquer approach: slice the molecule in half through a bunch of bonds, compute partial hashes for each half and then sew the halves together in time exponential in the number of bonds you sliced through. For the types of molecules I've seen in real pharmaceutical databases (say) this is actually pretty cheap if you're smart about the slicing. The hashes in the above code could probably be computed many thousands of times faster. As it is, you'll probably need to compile the above with optimisation.I'm willing to bet that with small changes, and with suitable choice of matrices over the reals, we can get invariants of molecules that predict physical properties. These calulations are reminiscent of algorithms for various types of counting algorithm so at the very least they probably compute things that are meaningful from a statistical mechanics perspective.Incidentally, this approach to stitching together atoms was inspired by an old paper by R. C. Penner on fatgraphs - nothing to do with chemistry. A few days ago he put a paper online about an application to modelling proteins Update: Since writing this I've found a name for what I'm doing. I'm converting a chemical structure diagram into a tensor network . There seems to be lots of literature on how to evaluate these efficiently. In effect, everything I've been doing in this blog in terms of converting diagrams to code is an example of evaluating a tensor network.Now comes the memoisation class:Missing instances fromAnd the same vector space monad I've used many times before. Strictly speaking, it's more like a semiring module monad asisn't a field.