"""
A sample script/module which demonstrates how to count hapaxes (tokens which
appear only once) in an untagged text corpus using plain python, NLTK, and
spaCy. It counts and lists hapaxes in five different ways:

* Wordforms - counts unique spellings (normalized for case). This uses
  plain Python (no NLTK required)

* NLTK stems - counts unique stems using a stemmer provided by NLTK

* NLTK lemmas - counts unique lemma forms using NLTK's part of speech
  tagger and interface to the WordNet lemmatizer.

* spaCy lemmas - counts unique lemma forms using the spaCy NLP module.

* Pattern lemmas - counts unique lemma forms using the Pattern NLP module.

All of the NLP modules (nltk, spaCy, Pattern) are optional; if any is not
installed then its respective hapax-counting method will not be run.

Usage:

    python hapaxes.py [file]

If 'file' is given, its contents are read and used as the text in which to
find hapaxes. If 'file' is omitted, then a test text will be used.

Example:

Running this script with no arguments:

    python hapaxes.py

Will process this text:

    Cory Linguist, a cautious corpus linguist, in creating a corpus of
    courtship correspondence, corrupted a crucial link. Now, if Cory Linguist,
    a careful corpus linguist, in creating a corpus of courtship
    correspondence, corrupted a crucial link, see that YOU, in creating a
    corpus of courtship correspondence, corrupt not a crucial link.

And produce this output:

                  Count
       Wordforms    9
           Stems    8
          Lemmas    8
           spaCy    8
         Pattern    8

    -- Hapaxes --
    Wordforms:   careful, cautious, corrupt, if, not, now, see, that, you
    NLTK-stems:  care, cautious, if, not, now, see, that, you
    NLTK-lemmas: care, cautious, if, not, now, see, that, you
    spaCy:       careful, cautious, if, not, now, see, that, you
    Pattern:     careful, cautious, if, not, now, see, that, you

Notice that the stems and lemmas methods do not count "corrupt" as a hapax
because it also occurs as "corrupted". Notice also that "Linguist" is not
counted as the text is normalized for case.

See also the Wikipedia entry on "Hapax legomenon"
(https://en.wikipedia.org/wiki/Hapax_legomenon)
"""

from __future__ import print_function
from __future__ import unicode_literals

import sys

from collections import Counter

import re

try:
    import nltk

    # Stemmer and lemmatizer used by the NLTK-based counters below:
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
except ImportError:
    nltk = None
    """ NLTK is not installed, so we won't use it. """

try:
    import spacy
except ImportError:
    spacy = None
    """ spaCy is not installed, so we won't use it. """

try:
    from pattern.en import parse
except ImportError:
    parse = None
    """ Pattern is not installed, so we won't use it. """

def normalize_tokenize(string):
    """
    Takes a string, normalizes it (makes it lowercase and
    removes punctuation), and then splits it into a list of
    words.

    Note that everything in this function is plain Python
    without using NLTK (although NLTK provides some more
    sophisticated tokenizers we could have used).
    """
    # Normalize for case so "Linguist" and "linguist" count together.
    norm = string.lower()

    # Replace every non-word, non-space character with a space; the
    # (?u) flag makes \w and \s unicode-aware on Python 2.
    norm = re.sub(r'(?u)[^\w\s]', ' ', norm)

    # str.split() with no argument splits on runs of whitespace and
    # drops empty strings, so extra spaces are harmless.
    tokens = norm.split()

    return tokens

def word_form_hapaxes(tokens):
    """
    Takes a list of tokens and returns a list of the
    wordform hapaxes (those wordforms that only appear once)

    For wordforms this is simple enough to do in plain
    Python without an NLP package, especially using the Counter
    type from the collections module (part of the Python
    standard library).
    """
    counts = Counter(tokens)
    hapaxes = [word for word in counts if counts[word] == 1]

    return hapaxes

def nltk_stem_hapaxes(tokens):
    """
    Takes a list of tokens and returns a list of the word
    stem hapaxes.

    Returns None when NLTK is not installed.
    """
    if not nltk:
        # NLTK is not available, so skip this method.
        return None

    # Stem each token; the Snowball stemmer handles English, which
    # merges e.g. "corrupt" and "corrupted" into one stem.
    stemmer = SnowballStemmer("english")
    stems = [stemmer.stem(token) for token in tokens]

    # Keep only the stems that occur exactly once.
    counts = Counter(stems)
    hapaxes = [stem for stem in counts if counts[stem] == 1]

    return hapaxes

def nltk_lemma_hapaxes(tokens):
    """
    Takes a list of tokens and returns a list of the lemma
    hapaxes.

    Returns None when NLTK is not installed.
    """
    if not nltk:
        # NLTK is not available, so skip this method.
        return None

    # Tag each token with its part of speech (Penn Treebank tags).
    tagged = nltk.pos_tag(tokens)

    # The WordNet lemmatizer needs a WordNet-style POS tag, so each
    # Treebank tag is converted with pt_to_wn() before lemmatizing.
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token, pos=pt_to_wn(tag))
              for (token, tag) in tagged]

    # Keep only the lemmas that occur exactly once.
    counts = Counter(lemmas)
    hapaxes = [lemma for lemma in counts if counts[lemma] == 1]

    return hapaxes

def pt_to_wn(pos):
    """
    Takes a Penn Treebank tag and converts it to an
    appropriate WordNet equivalent for lemmatization.

    A list of Penn Treebank tags is available at:
    https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    """
    # Imported here (not at module top) so this file still loads when
    # NLTK is absent; this function is only called from NLTK code paths.
    from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV

    pos = pos.lower()

    if pos.startswith('jj'):
        # Adjectives: JJ, JJR, JJS
        tag = ADJ
    elif pos == 'md':
        # Modal auxiliary verbs (can, should, will, ...)
        tag = VERB
    elif pos.startswith('rb'):
        # Adverbs: RB, RBR, RBS
        tag = ADV
    elif pos.startswith('vb'):
        # Verbs: VB, VBD, VBG, VBN, VBP, VBZ
        tag = VERB
    elif pos == 'wrb':
        # Wh-adverbs (when, where, why)
        tag = ADV
    else:
        # Everything else falls back to noun, which is also the
        # WordNet lemmatizer's own default part of speech.
        tag = NOUN

    return tag

def spacy_hapaxes(rawtext):
    """
    Takes plain text and returns a list of lemma hapaxes using
    the spaCy NLP package.

    Returns None when spaCy is not installed.
    """
    if not spacy:
        # spaCy is not available, so skip this method.
        return None

    # Load the English model and analyze the raw text; spaCy does its
    # own tokenization, so we do not pre-tokenize here.
    nlp = spacy.load('en')
    doc = nlp(rawtext)

    # Collect the lemma of every real token, skipping punctuation
    # and whitespace tokens.
    lemmas = [token.lemma_ for token in doc
              if not token.is_punct and not token.is_space]

    # Keep only the lemmas that occur exactly once.
    counts = Counter(lemmas)
    return [lemma for lemma in counts if counts[lemma] == 1]

def pattern_hapaxes(rawtext):
    """
    Takes plain text and returns a list of lemma hapaxes
    using the Pattern NLP module.

    Returns None when Pattern is not installed.
    """
    if not parse:
        # Pattern is not available, so skip this method.
        return None

    # Tokenize and lemmatize the text; POS tags and chunks are not
    # needed for a simple lemma count, so they are disabled.
    parsed = parse(rawtext, lemmata=True, tags=False, chunks=False)

    # Tally the lemma of every token in every sentence.
    # NOTE(review): with only lemmata enabled, each token appears to be
    # [word, lemma], so token[1] is the lemma -- confirm against the
    # Pattern parse() output format.
    counts = Counter()
    for sentence in parsed.split():
        for token in sentence:
            counts[token[1]] += 1

    # Keep only the lemmas that occur exactly once.
    return [lemma for lemma in counts if counts[lemma] == 1]

if __name__ == "__main__":
    """
    The code in this block is run when this file is executed as a script (but
    not if it is imported as a module by another Python script).
    """

    # Built-in test text, used when no file is given on the command line:
    text = """ Cory Linguist, a cautious corpus linguist, in creating a
    corpus of courtship correspondence, corrupted a crucial link. Now, if Cory
    Linguist, a careful corpus linguist, in creating a corpus of courtship
    correspondence, corrupted a crucial link, see that YOU, in creating a
    corpus of courtship correspondence, corrupt not a crucial link. """

    if len(sys.argv) > 1:
        # A filename was supplied: read its contents as the text to process.
        with open(sys.argv[1], 'r') as file:
            text = file.read()
        try:
            # Python 2 only: promote the raw file contents to unicode.
            # On Python 3 the name 'unicode' does not exist, so the
            # NameError is expected and the text is left as-is.
            text = unicode(text)
        except NameError:
            pass

    # Tokenize once; the token-based counters share this list, while the
    # spaCy and Pattern counters work from the raw text directly.
    tokens = normalize_tokenize(text)

    # Parallel lists: a label for each available method and the list of
    # hapaxes that method found. Methods whose module is missing are
    # simply omitted.
    names = ["Wordforms"]
    hapaxes = [word_form_hapaxes(tokens)]

    if nltk:
        names += ["NLTK-stems", "NLTK-lemmas"]
        hapaxes += [nltk_stem_hapaxes(tokens), nltk_lemma_hapaxes(tokens)]

    if spacy:
        names += ["spaCy"]
        hapaxes += [spacy_hapaxes(text)]

    if parse:
        names += ["Pattern"]
        hapaxes += [pattern_hapaxes(text)]

    # Sort each hapax list so the output is stable and readable.
    for haps in hapaxes:
        haps.sort()

    # Pair each method name with its results. Materialized as a list
    # because the pairs are iterated twice (zip() is a one-shot
    # iterator on Python 3).
    pairs = list(zip(names, hapaxes))

    # First a small table of counts per method...
    fmt = "{:>14}{:^8}"
    print(fmt.format("", "Count"))
    for pair in pairs:
        print(fmt.format(pair[0], len(pair[1])))

    # ...then the hapaxes themselves, one line per method.
    print("\n-- Hapaxes --")
    for pair in pairs:
        print("{:<14}{:<68}".format(pair[0] + ":", ", ".join(pair[1])))
    print("\n")