#abstracts.py
#
#  by Joe Hahn, jhahn@spacescience.org, 29 January 2014.
#
#  Train a naive Bayes text classifier to predict the subject of abstracts from various astronomy journal
#  papers. The abstracts of these papers were gathered from the NASA ADS abstract service at
#  http://adsabs.harvard.edu/abstract_service.html (for example, enter "asteroids" into the
#  Abstract Words/Keywords box and set the "All refereed articles" and "ProCite format" buttons
#  then click "Send Query"). But in this exercise I only gather abstracts of papers on
#  three topics: asteroids, comets, and galaxies. The following reads those abstracts
#  from three input files (asteroids.txt etc), extracts and cleans up the text,
#  and then splits the abstracts into the usual training and testing sets. The code then
#  runs the trained text classifier on three additional abstracts that are included in this code;
#  their subjects are asteroids, comets, and (in an attempt to confuse the classifier) Mars.

#to execute in ipython:		> ipython --pylab	In [1]: %run abstracts.py

#import modules used below
import numpy as np
import re
#imports for Naive Bayes classifier
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from scipy.stats import sem

#read and extract abstracts from files asteroids.txt, comets.txt, etc
subject_names = ['asteroids', 'comets', 'galaxies']
abstracts = []
subjects = []
#within each input file an abstract is the text that follows a begin_tag and runs up to the
#next end_tag
begin_tag = 'N2  - '
end_tag = 'DO  - '
for subject_name in subject_names:
    #the with-block closes the file even when the read fails
    with open(subject_name + '.txt', 'r') as abstracts_file:
        text = abstracts_file.read()
    #offsets of the first character after every begin_tag
    starts = [m.end() for m in re.finditer(begin_tag, text)]
    #an abstract may not run past the start of the next abstract (or the end of the file)
    limits = starts[1:] + [len(text)]
    for start, limit in zip(starts, limits):
        #look for the end_tag that belongs to this abstract. Pairing each begin_tag with the
        #end_tag that follows it (rather than pairing the n-th begin_tag with the n-th end_tag)
        #keeps one incomplete record from shifting the pairing of every record after it
        stop = text.find(end_tag, start, limit)
        if stop == -1:
            #no end_tag before the next abstract begins, so where this abstract ends is unknown
            continue
        #abstracts are wrapped over several lines in the file, so join those lines
        a = text[start:stop].replace('\n', '')
        #bytes that cannot be decoded are replaced instead of raising an error
        a = unicode(a, errors='replace')
        abstracts.append(a)
        subjects.append(subject_name)

#hold back test_fraction of the abstracts (and their subjects) for testing and train on the rest
test_fraction = 0.25
split = train_test_split(abstracts, subjects, test_size=test_fraction, random_state=2)
x_train, x_test, y_train, y_test = split

#set up the classifier
def evaluate_cross_validation(clf, x, y, K):
    """Print the K-fold cross-validation scores of a classifier.

    clf is the classifier (or pipeline) that gets scored, x holds the samples and
    y their labels, and K is the number of folds. The folds are shuffled with a
    fixed random_state. The score of every fold is printed, followed by the mean
    score and the standard error of that mean. Nothing is returned.
    """
    folds = KFold(len(y), K, shuffle=True, random_state=0)
    fold_scores = cross_val_score(clf, x, y, cv=folds)
    print(fold_scores)
    mean_score = np.mean(fold_scores)
    score_error = sem(fold_scores)
    print("mean score: {0:.3f} (+/-{1:.3f})".format(mean_score, score_error))

#score various combinations of text vectorizes and classifier schemes, onces that don't perform as well are commented out
#clf = Pipeline([ ('vect', CountVectorizer()),  ('clf', MultinomialNB()) ])
#evaluate_cross_validation(clf, x_train, y_train, 5)
#clf = Pipeline([ ('vect', HashingVectorizer(non_negative=True)),  ('clf', MultinomialNB()) ])
#evaluate_cross_validation(clf, x_train, y_train, 5)
#clf = Pipeline([ ('vect', TfidfVectorizer()),  ('clf', MultinomialNB()) ])
#evaluate_cross_validation(clf, x_train, y_train, 5)
#clf = Pipeline([ ('vect', TfidfVectorizer(token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b")), ('clf', MultinomialNB())])
#evaluate_cross_validation(clf, x_train, y_train, 5)
#clf = Pipeline([ ('vect', CountVectorizer(token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b")), ('clf', MultinomialNB()) ])
#evaluate_cross_validation(clf, x_train, y_train, 5)
#clf = Pipeline([ ('vect', CountVectorizer(token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b")), ('clf', MultinomialNB(alpha=5.0)) ])
#evaluate_cross_validation(clf, x_train, y_train, 5)
#clf = Pipeline([ ('vect', CountVectorizer(token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b")), ('clf', MultinomialNB(alpha=1.0)) ])
#evaluate_cross_validation(clf, x_train, y_train, 5)
#clf = Pipeline([ ('vect', CountVectorizer(token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b")), ('clf', MultinomialNB(alpha=0.05)) ])
#evaluate_cross_validation(clf, x_train, y_train, 5)
clf = Pipeline([ ('vect', CountVectorizer(token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b")), ('clf', MultinomialNB(alpha=0.2)) ])
evaluate_cross_validation(clf, x_train, y_train, 5)
print
print

#train the clf classifier and asses its accuracy
clf.fit(x_train, y_train)
print 'accuracy on training set = ', clf.score(x_train, y_train)
print 'accuracy on testing  set = ', clf.score(x_test, y_test)
y_pred = clf.predict(x_test)
print metrics.classification_report(y_test, y_pred)
print 'confusion matrix'
print metrics.confusion_matrix(y_test, y_pred)
print
print

#see if the trained classifier can recognize this abstract on asteroids
abstrct = [unicode("""We present preliminary diameters and albedos for 13511 Main Belt 
asteroids (MBAs) that were observed during the 3-Band Cryo phase of the 
Wide-field Infrared Survey Explorer (WISE; after the outer cryogen tank 
was exhausted) and as part of the NEOWISE Post-Cryo Survey (after the 
inner cryogen tank was exhausted). With a reduced or complete loss of 
sensitivity in the two long wavelength channels of WISE, the uncertainty 
in our fitted diameters and albedos is increased to ~20% for diameter 
and ~40% for albedo. Diameter fits using only the 3.4 and 4.6 mum 
channels are shown to be dependent on the literature optical H absolute 
magnitudes. These data allow us to increase the number of size estimates 
for large MBAs which have been identified as members of dynamical 
families. We present thermal fits for 14 asteroids previously identified 
as the parents of a dynamical family that were not observed during the 
fully cryogenic mission.""".replace('\n', ''), errors='replace')]
print abstrct[0][0:500] + '...'
prob = clf.predict_proba(abstrct)[0]
for j in range(len(prob)): print 'probability subject is ' + subject_names[j] + ' is ', prob[j] 
print '\033[1mabstract subject = ', clf.predict(abstrct)[0], '\033[0m\n'

#see if the classifier can recognize this abstract on comets
abstrct = [unicode("""Methyl formate is a complex organic molecule considered potentially 
relevant as precursor of biologically active molecules. It has been 
observed in several astrophysical environments, such as hot cores, hot 
corinos, and comets. The processes that drive the formation of molecules 
in cometary ices are poorly understood. In particular it is not yet 
clear if molecules are directly accreted from the pre-solar nebula to 
form comets or are formed after accretion. The present work analyzes the 
possible role of cosmic ion irradiation and radioactive decay in methyl 
formate formation in methanol-bearing ices. The results indicate that 
cosmic ion irradiation can account for about 12% of the methyl formate 
observed in comet Hale-Bopp, while radioactive decay can account for 
about 6% of this amount. The need of new data coming from earth based 
and space observational projects as well as from laboratory experiments 
is outlined.""".replace('\n', ''), errors='replace')]
print abstrct[0][0:500] + '...'
prob = clf.predict_proba(abstrct)[0]
for j in range(len(prob)): print 'probability subject is ' + subject_names[j] + ' is ', prob[j] 
print '\033[1mabstract subject = ', clf.predict(abstrct)[0], '\033[0m\n'

#lets see how an abstract on Mars gets classified
abstrct = [unicode("""The hydrosphere of Mars has remained mostly concealed within the 
subsurface for the past ~3.5 Gyr. Localized rupturing of the 
permafrost-capped crust led to voluminous groundwater discharges that 
carved some of the largest known channels in the solar system. However, 
our knowledge of the nature of the flows and their ultimate fate remains 
incomplete, partly because diagnostic landforms at outflow channel 
termini have been largely destroyed or buried. The Hebrus Valles outflow 
channels were excavated by fluid discharges that emanated from two point 
sources, and they mostly terminate in systems of fractures and 
depressions within the northern plains. Our investigation indicates that 
outflow channel floodwaters were captured and reabsorbed into the 
subsurface in zones where caverns developed within the northern plains. 
These findings imply that the study region comprises the only known 
location in the Martian northern lowlands where the fate of outflow 
channel discharges can be assessed with confidence. We propose that 
evacuation of subsurface materials via mud volcanism was an important 
process in cavern formation. Our conceptual model provides a hypothesis 
to account for the fate of sediments and fluids from some of the Martian 
outflow channels. It also reveals a mechanism for lowland cavern 
formation and upper crustal volatile enrichment after the development of 
the Martian global cryosphere.""".replace('\n', ''), errors='replace')]
print abstrct[0][0:500] + '...'
prob = clf.predict_proba(abstrct)[0]
for j in range(len(prob)): print 'probability subject is ' + subject_names[j] + ' is ', prob[j] 
print '\033[1mabstract subject = ', clf.predict(abstrct)[0], '\033[0m\n'