#abstracts.py
#
# by Joe Hahn, jhahn@spacescience.org, 29 January 2014.
#
# Train a naive Bayes text classifier to predict the subject of abstracts from various astronomy journal
# papers. The abstracts of these papers were gathered from the NASA ADS abstract service at
# http://adsabs.harvard.edu/abstract_service.html (for example, enter "asteroids" into the
# Abstract Words/Keywords box and set the "All refereed articles" and "ProCite format" buttons
# then click "Send Query"). But in this exercise I only gather abstracts of papers on
# three topics: asteroids, comets, and galaxies. The following reads those abstracts
# from three input files (asteroids.txt etc), extracts and cleans up the text,
# and then splits the abstracts into the usual training and testing sets. The code then
# runs the trained text classifier on three additional abstracts that are included in this code,
# and their subjects are asteroids, comets, and (in an attempt to confuse the classifier) an abstract on Mars.
from __future__ import print_function

# to execute in ipython:
#   > ipython --pylab
#   In [1]: %run abstracts.py

# import modules used below
import io
import re

import numpy as np
from scipy.stats import sem

# imports for Naive Bayes classifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# The splitting / cross-validation helpers lived in sklearn.cross_validation
# when this script was written; newer scikit-learn releases only ship them in
# sklearn.model_selection, where KFold takes different arguments. The legacy
# module is tried first so the script behaves exactly as before wherever it
# already ran.
try:
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import cross_val_score, KFold
    _LEGACY_CV_API = True
except ImportError:
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import cross_val_score, KFold
    _LEGACY_CV_API = False

# Token pattern shared by the vectorizers below.
TOKEN_PATTERN = r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b"


def _read_abstracts(path):
    """Return the abstracts found in the ADS export file at *path*.

    Each abstract is the text between an 'N2 - ' marker and the 'DO - '
    marker with the same ordinal position in the file, with newlines removed.
    Bytes that are not ASCII are replaced rather than raising.
    """
    # newline='' disables newline translation, so the characters seen here
    # match a plain byte-for-byte read on POSIX systems.
    with io.open(path, 'r', encoding='ascii', errors='replace', newline='') as handle:
        text = handle.read()
    begin = [m.start() for m in re.finditer('N2 - ', text)]
    end = [m.start() for m in re.finditer('DO - ', text)]
    # NOTE(review): the offset 6 is one more than len('N2 - '); presumably the
    # marker in the input files has two spaces ('N2  - ') -- confirm against
    # the data files. The i-th begin marker is paired with the i-th end marker,
    # which assumes every record has both fields.
    return [text[start + 6:end[j]].replace('\n', '') for j, start in enumerate(begin)]


def evaluate_cross_validation(clf, x, y, K):
    """Print the K-fold cross-validation scores of *clf* on (*x*, *y*).

    The folds are shuffled with a fixed random_state of 0. Prints the array
    of per-fold scores followed by their mean and standard error.
    """
    if _LEGACY_CV_API:
        cv = KFold(len(y), K, shuffle=True, random_state=0)
    else:
        cv = KFold(n_splits=K, shuffle=True, random_state=0)
    scores = cross_val_score(clf, x, y, cv=cv)
    print(scores)
    print("mean score: {0:.3f} (+/-{1:.3f})".format(np.mean(scores), sem(scores)))


def _report_prediction(clf, sample):
    """Print a preview, the class probabilities and the predicted subject.

    *clf* is the fitted pipeline and *sample* a one-element list holding the
    abstract text. Returns the probability row for that abstract.
    """
    print(sample[0][0:500] + '...')
    prob = clf.predict_proba(sample)[0]
    # predict_proba orders its columns like the fitted estimator's classes_,
    # so take the labels from there rather than assuming an ordering.
    labels = clf.named_steps['clf'].classes_
    for label, p in zip(labels, prob):
        print('probability subject is ' + str(label) + ' is ', p)
    print('\033[1mabstract subject = ', clf.predict(sample)[0], '\033[0m\n')
    return prob


# read and extract abstracts from files asteroids.txt, comets.txt, etc
subject_names = ['asteroids', 'comets', 'galaxies']
abstracts = []
subjects = []
for name in subject_names:
    found = _read_abstracts(name + '.txt')
    abstracts.extend(found)
    subjects.extend([name] * len(found))

# split data into training and test sets
test_fraction = 0.25
x_train, x_test, y_train, y_test = train_test_split(
    abstracts, subjects, test_size=test_fraction, random_state=2)

# score various combinations of text vectorizers and classifier schemes;
# ones that don't perform as well are commented out
#clf = Pipeline([ ('vect', CountVectorizer()), ('clf', MultinomialNB()) ])
#evaluate_cross_validation(clf, x_train, y_train, 5)
#clf = Pipeline([ ('vect', HashingVectorizer(non_negative=True)), ('clf', MultinomialNB()) ])
#evaluate_cross_validation(clf, x_train, y_train, 5)
#clf = Pipeline([ ('vect', TfidfVectorizer()), ('clf', MultinomialNB()) ])
#evaluate_cross_validation(clf, x_train, y_train, 5)
#clf = Pipeline([ ('vect', TfidfVectorizer(token_pattern=TOKEN_PATTERN)), ('clf', MultinomialNB()) ])
#evaluate_cross_validation(clf, x_train, y_train, 5)
#clf = Pipeline([ ('vect', CountVectorizer(token_pattern=TOKEN_PATTERN)), ('clf', MultinomialNB()) ])
#evaluate_cross_validation(clf, x_train, y_train, 5)
#clf = Pipeline([ ('vect', CountVectorizer(token_pattern=TOKEN_PATTERN)), ('clf', MultinomialNB(alpha=5.0)) ])
#evaluate_cross_validation(clf, x_train, y_train, 5)
#clf = Pipeline([ ('vect', CountVectorizer(token_pattern=TOKEN_PATTERN)), ('clf', MultinomialNB(alpha=1.0)) ])
#evaluate_cross_validation(clf, x_train, y_train, 5)
#clf = Pipeline([ ('vect', CountVectorizer(token_pattern=TOKEN_PATTERN)), ('clf', MultinomialNB(alpha=0.05)) ])
#evaluate_cross_validation(clf, x_train, y_train, 5)
clf = Pipeline([
    ('vect', CountVectorizer(token_pattern=TOKEN_PATTERN)),
    ('clf', MultinomialNB(alpha=0.2)),
])
evaluate_cross_validation(clf, x_train, y_train, 5)
print()
print()

# train the clf classifier and assess its accuracy
clf.fit(x_train, y_train)
print('accuracy on training set = ', clf.score(x_train, y_train))
print('accuracy on testing set = ', clf.score(x_test, y_test))
y_pred = clf.predict(x_test)
print(metrics.classification_report(y_test, y_pred))
print('confusion matrix')
print(metrics.confusion_matrix(y_test, y_pred))
print()
print()

# see if the trained classifier can recognize this abstract on asteroids
abstrct = [(
    u"We present preliminary diameters and albedos for 13511 Main Belt asteroids (MBAs) "
    u"that were observed during the 3-Band Cryo phase of the Wide-field Infrared Survey "
    u"Explorer (WISE; after the outer cryogen tank was exhausted) and as part of the "
    u"NEOWISE Post-Cryo Survey (after the inner cryogen tank was exhausted). "
    u"With a reduced or complete loss of sensitivity in the two long wavelength channels "
    u"of WISE, the uncertainty in our fitted diameters and albedos is increased to ~20% "
    u"for diameter and ~40% for albedo. Diameter fits using only the 3.4 and 4.6 mum "
    u"channels are shown to be dependent on the literature optical H absolute magnitudes. "
    u"These data allow us to increase the number of size estimates for large MBAs which "
    u"have been identified as members of dynamical families. We present thermal fits for "
    u"14 asteroids previously identified as the parents of a dynamical family that were "
    u"not observed during the fully cryogenic mission."
)]
prob = _report_prediction(clf, abstrct)

# see if the classifier can recognize this abstract on comets
abstrct = [(
    u"Methyl formate is a complex organic molecule considered potentially relevant as "
    u"precursor of biologically active molecules. It has been observed in several "
    u"astrophysical environments, such as hot cores, hot corinos, and comets. The "
    u"processes that drive the formation of molecules in cometary ices are poorly "
    u"understood. In particular it is not yet clear if molecules are directly accreted "
    u"from the pre-solar nebula to form comets or are formed after accretion. The present "
    u"work analyzes the possible role of cosmic ion irradiation and radioactive decay in "
    u"methyl formate formation in methanol-bearing ices. The results indicate that cosmic "
    u"ion irradiation can account for about 12% of the methyl formate observed in comet "
    u"Hale-Bopp, while radioactive decay can account for about 6% of this amount. The need "
    u"of new data coming from earth based and space observational projects as well as from "
    u"laboratory experiments is outlined."
)]
prob = _report_prediction(clf, abstrct)

# lets see how an abstract on Mars gets classified
abstrct = [(
    u"The hydrosphere of Mars has remained mostly concealed within the subsurface for the "
    u"past ~3.5 Gyr. Localized rupturing of the permafrost-capped crust led to voluminous "
    u"groundwater discharges that carved some of the largest known channels in the solar "
    u"system. However, our knowledge of the nature of the flows and their ultimate fate "
    u"remains incomplete, partly because diagnostic landforms at outflow channel termini "
    u"have been largely destroyed or buried. The Hebrus Valles outflow channels were "
    u"excavated by fluid discharges that emanated from two point sources, and they mostly "
    u"terminate in systems of fractures and depressions within the northern plains. Our "
    u"investigation indicates that outflow channel floodwaters were captured and "
    u"reabsorbed into the subsurface in zones where caverns developed within the northern "
    u"plains. These findings imply that the study region comprises the only known location "
    u"in the Martian northern lowlands where the fate of outflow channel discharges can be "
    u"assessed with confidence. We propose that evacuation of subsurface materials via mud "
    u"volcanism was an important process in cavern formation. Our conceptual model "
    u"provides a hypothesis to account for the fate of sediments and fluids from some of "
    u"the Martian outflow channels. It also reveals a mechanism for lowland cavern "
    u"formation and upper crustal volatile enrichment after the development of the Martian "
    u"global cryosphere."
)]
prob = _report_prediction(clf, abstrct)