diff --git a/project.clj b/project.clj index e73fc92..d2497ab 100644 --- a/project.clj +++ b/project.clj @@ -3,7 +3,7 @@ :url "http://birdwave.neo.com" :license {:name "Eclipse Public License" :url "http://www.eclipse.org/legal/epl-v10.html"} - :dependencies [[org.clojure/clojure "1.6.0"] + :dependencies [[org.clojure/clojure "1.8.0"] [io.pedestal/pedestal.service "0.2.2"] [io.pedestal/pedestal.service-tools "0.2.2"] [com.datomic/datomic-pro "0.9.4815.12" @@ -16,7 +16,10 @@ [com.cognitect/transit-cljs "0.8.188"] [org.omcljs/om "0.8.7" :as om] [cljsjs/d3 "3.5.3-0"] - [com.cemerick/url "0.1.1"]] + [com.cemerick/url "0.1.1"] + [cascalog/cascalog-core "3.0.0"] + [cascalog/cascalog-more-taps "3.0.0"] + [com.netflix.pigpen/pigpen-pig "0.3.2"]] :plugins [[lein-cljsbuild "1.0.3"] [datomic-schema-grapher "0.0.1"] [ohpauleez/lein-pedestal "0.1.0-beta10"]] @@ -77,7 +80,10 @@ :profiles {:dev {:dependencies [[io.pedestal/pedestal.jetty "0.2.2"] [datomic-schema-grapher "0.0.1"] [ankha "0.1.1"] - [org.clojure/tools.namespace "0.2.4"]] + [org.clojure/tools.namespace "0.2.4"] + [org.apache.pig/pig "0.13.0"] + [org.apache.hadoop/hadoop-core "1.2.1"]] + :jvm-opts ["-Xms768m" "-Xmx768m"] :source-paths ["dev/clj"]}} diff --git a/sample_data/birds.txt b/sample_data/birds.txt index 89282ec..91f18c3 100644 --- a/sample_data/birds.txt +++ b/sample_data/birds.txt @@ -1,4 +1,3 @@ -˙GLOBAL UNIQUE IDENTIFIER TAXONOMIC ORDER CATEGORY COMMON NAME SCIENTIFIC NAME SUBSPECIES COMMON NAME SUBSPECIES SCIENTIFIC NAME OBSERVATION COUNT BREEDING BIRD ATLAS CODE AGE/SEX COUNTRY COUNTRY_CODE STATE_PROVINCE SUBNATIONAL1_CODE COUNTY SUBNATIONAL2_CODE IBA CODE LOCALITY LOCALITY ID LOCALITY TYPE LATITUDE LONGITUDE OBSERVATION DATE TIME OBSERVATIONS STARTED TRIP COMMENTS SPECIES COMMENTS OBSERVER ID FIRST NAME LAST NAME SAMPLING EVENT IDENTIFIER PROTOCOL TYPE PROJECT CODE DURATION MINUTES EFFORT DISTANCE KM EFFORT AREA HA NUMBER OBSERVERS ALL SPECIES REPORTED GROUP IDENTIFIER APPROVED REVIEWED REASON URN:CornellLabOfOrnithology:EBIRD:OBS175897027 21475 species American Dipper Cinclus mexicanus 4 United States US Alaska US-AK Aleutians West US-AK-016 US-AK_4441 Captain's Bay L1875942 P 53.8412816 -166.5905365 2012-12-29 23:04:00 These are the birds counted while covering my section (Captain's Bay) of the Unalaska Island Count circle for the 113th CBC. I covered four miles by road and 1.5 miles (3 roundtrip) by foot. Temps 35 to 38 F, drizzle turning to heavy rain as the day went on, overcast with light NE winds, fresh water partly frozen. 6"to 12" snow depth. obs280983 Suzi Golodoff S12544775 eBird - Traveling Count EBIRD 300 8.851 1 1 1 0 URN:CornellLabOfOrnithology:EBIRD:OBS175897003 29709 species American Tree Sparrow Spizella arborea 1 United States US Alaska US-AK Aleutians West US-AK-016 US-AK_4441 Captain's Bay L1875942 P 53.8412816 -166.5905365 2012-12-29 23:04:00 These are the birds counted while covering my section (Captain's Bay) of the Unalaska Island Count circle for the 113th CBC. I covered four miles by road and 1.5 miles (3 roundtrip) by foot. Temps 35 to 38 F, drizzle turning to heavy rain as the day went on, overcast with light NE winds, fresh water partly frozen. 6"to 12" snow depth. A casual/accidental species in Unalaska. There are three individuals in the area now ( first week January 2013). Two showed up in Captain's Bay on December 15th, 2012 ( relocated for the CBC) and a third has been frequenting a local feeder. I have photos of that bird. obs280983 Suzi Golodoff S12544775 eBird - Traveling Count EBIRD 300 8.851 1 1 1 1 URN:CornellLabOfOrnithology:EBIRD:OBS175897012 2881 species Bald Eagle Haliaeetus leucocephalus 90 United States US Alaska US-AK Aleutians West US-AK-016 US-AK_4441 Captain's Bay L1875942 P 53.8412816 -166.5905365 2012-12-29 23:04:00 These are the birds counted while covering my section (Captain's Bay) of the Unalaska Island Count circle for the 113th CBC. I covered four miles by road and 1.5 miles (3 roundtrip) by foot. Temps 35 to 38 F, drizzle turning to heavy rain as the day went on, overcast with light NE winds, fresh water partly frozen. 6"to 12" snow depth. obs280983 Suzi Golodoff S12544775 eBird - Traveling Count EBIRD 300 8.851 1 1 1 0 diff --git a/src/clj/bird_wave/birdlog.clj b/src/clj/bird_wave/birdlog.clj new file mode 100644 index 0000000..3ca6afa --- /dev/null +++ b/src/clj/bird_wave/birdlog.clj @@ -0,0 +1,109 @@ +(ns bird-wave.birdlog + (:require [cascalog.api :refer :all] + [cascalog.logic.ops :as c] + [cascalog.more-taps :as taps] + [clojure.string :as cs] + [clojure.instant :as inst])) + +(def field-positions + [:sighting/guid ;; "GLOBAL UNIQUE IDENTIFIER" + :taxon/order ;; "TAXONOMIC ORDER" + nil ;; "CATEGORY" + :taxon/common-name ;; "COMMON NAME" + :taxon/scientific-name ;; "SCIENTIFIC NAME" + :taxon/subspecies-common-name ;; "SUBSPECIES COMMON NAME" + :taxon/subspecies-scientific-name;; "SUBSPECIES SCIENTIFIC NAME" + :sighting/count ;; "OBSERVATION COUNT" ;; x indicates uncounted + nil ;; "BREEDING BIRD ATLAS CODE" + nil ;; "AGE/SEX" + nil ;; "COUNTRY" + nil ;; "COUNTRY_CODE" + :sighting/state ;; "STATE_PROVINCE" + :sighting/state-code ;; "SUBNATIONAL1_CODE" + :sighting/county ;; "COUNTY" + :sighting/county-code ;; "SUBNATIONAL2_CODE" + nil ;; "IBA CODE" + :sighting/locality ;; "LOCALITY" + nil ;; "LOCALITY ID" + nil ;; "LOCALITY TYPE" + :sighting/latitude ;; "LATITUDE" + :sighting/longitude ;; "LONGITUDE" + :sighting/date ;; "OBSERVATION DATE" + nil ;; "TIME OBSERVATIONS STARTED" + nil ;; "TRIP COMMENTS" + nil ;; "SPECIES COMMENTS" + nil ;; "OBSERVER ID" + nil ;; "FIRST NAME" + nil ;; "LAST NAME" + nil ;; "SAMPLING EVENT IDENTIFIER" + nil ;; "PROTOCOL TYPE" + nil ;; "PROJECT CODE" + nil ;; "DURATION MINUTES" + nil ;; "EFFORT DISTANCE KM" + nil ;; "EFFORT AREA HA" + nil ;; "NUMBER OBSERVERS" + nil ;; "ALL SPECIES REPORTED" + nil ;; "GROUP IDENTIFIER" + nil ;; "APPROVED" + nil ;; "REVIEWED" + nil ;; "REASON" + ]) + +(def fields (concat (remove nil? field-positions) + [:sighting/month-yr])) + +(defn sighting + "given a line of text, split on tabs and return the fields we care about + (indicated by non-nil presence in fields vector)" + [plaintext-row] + (into {} + (remove nil? + (map #(if % [% %2]) + field-positions + (cs/split plaintext-row #"\t"))))) + +(defn coerce [m f & [key & keys]] + (if key + (recur (assoc m key (f (get m key))) f keys) + m)) + +(defn parse-count [s] + (if (= "X" s) 1 (Long/parseLong s))) + +(defn coerce-values [sighting] + (let [date (inst/read-instant-date (:sighting/date sighting)) ] + (-> sighting + (coerce bigdec + :sighting/latitude + :sighting/longitude) + (coerce parse-count + :sighting/count) + (assoc :sighting/date date) + (assoc :sighting/month-yr (format "%tY/%tm" date date)) + ))) + +(defn ordered-values + "return sighting values in the order they appear in field-positions" + [sighting] + (map sighting fields)) + +(defmapop parse-line [line] + (ordered-values (coerce-values (sighting line)))) + +(comment + (taps/hfs-delimited "sample_data/out/" + :delimiter "," + :classes false + :write-header? true + :quote true + :sink-template "%s/%s" + :templatefields ["order" "month-yr"]) + + + (?- (hfs-textline "sample_data/out/" :sink-template "%s/%s" :templatefields ["?order" "?month-yr"]) + (<- [?order ?state ?county ?month-yr ?total] + ((hfs-textline "sample_data/birds.txt") ?line) + (parse-line ?line :> ?guid ?order ?common-name ?scientific-name ?subspecies-common-name ?subspecies-scientific-name ?count ?state ?state-code ?county ?county-code ?locality ?latitude ?longitude ?date ?month-yr) + (c/sum ?count :> ?total))) + + ) diff --git a/src/clj/bird_wave/import.clj b/src/clj/bird_wave/import.clj index 927bf2e..b98dda2 100644 --- a/src/clj/bird_wave/import.clj +++ b/src/clj/bird_wave/import.clj @@ -5,7 +5,6 @@ [datomic.api :as d])) (def fields - ;; TODO: consider including guid as a primary key for sighting [:sighting/guid ;; "GLOBAL UNIQUE IDENTIFIER" :taxon/order ;; "TAXONOMIC ORDER" nil ;; "CATEGORY" diff --git a/src/clj/bird_wave/pigpen.clj b/src/clj/bird_wave/pigpen.clj new file mode 100644 index 0000000..3736812 --- /dev/null +++ b/src/clj/bird_wave/pigpen.clj @@ -0,0 +1,131 @@ +(ns bird-wave.pigpen + (:require [pigpen.core :as pig] + [pigpen.fold :as fold] + [clojure.string :as cs] + [clojure.instant :as inst])) + +(defn birds-file + [filename] + (pig/load-tsv filename)) + +(def field-positions + [:sighting/guid ;; "GLOBAL UNIQUE IDENTIFIER" + :taxon/order ;; "TAXONOMIC ORDER" + nil ;; "CATEGORY" + :taxon/common-name ;; "COMMON NAME" + :taxon/scientific-name ;; "SCIENTIFIC NAME" + :taxon/subspecies-common-name ;; "SUBSPECIES COMMON NAME" + :taxon/subspecies-scientific-name;; "SUBSPECIES SCIENTIFIC NAME" + :sighting/count ;; "OBSERVATION COUNT" ;; x indicates uncounted + nil ;; "BREEDING BIRD ATLAS CODE" + nil ;; "AGE/SEX" + nil ;; "COUNTRY" + nil ;; "COUNTRY_CODE" + :sighting/state ;; "STATE_PROVINCE" + :sighting/state-code ;; "SUBNATIONAL1_CODE" + :sighting/county ;; "COUNTY" + :sighting/county-code ;; "SUBNATIONAL2_CODE" + nil ;; "IBA CODE" + :sighting/locality ;; "LOCALITY" + nil ;; "LOCALITY ID" + nil ;; "LOCALITY TYPE" + :sighting/latitude ;; "LATITUDE" + :sighting/longitude ;; "LONGITUDE" + :sighting/date ;; "OBSERVATION DATE" + nil ;; "TIME OBSERVATIONS STARTED" + nil ;; "TRIP COMMENTS" + nil ;; "SPECIES COMMENTS" + nil ;; "OBSERVER ID" + nil ;; "FIRST NAME" + nil ;; "LAST NAME" + nil ;; "SAMPLING EVENT IDENTIFIER" + nil ;; "PROTOCOL TYPE" + nil ;; "PROJECT CODE" + nil ;; "DURATION MINUTES" + nil ;; "EFFORT DISTANCE KM" + nil ;; "EFFORT AREA HA" + nil ;; "NUMBER OBSERVERS" + nil ;; "ALL SPECIES REPORTED" + nil ;; "GROUP IDENTIFIER" + nil ;; "APPROVED" + nil ;; "REVIEWED" + nil ;; "REASON" + ]) + +(def fields (concat (remove nil? field-positions) + [:sighting/month-yr])) + +(defn sighting + "given a line of text, split on tabs and return the fields we care about + (indicated by non-nil presence in fields vector)" + [row] + (->> row + (map #(if % [% %2]) field-positions) + (remove nil?) + (into {}))) + +(defn coerce [m f & [key & keys]] + (if key + (recur (assoc m key (f (get m key))) f keys) + m)) + +(defn parse-count [s] + (if (= "X" s) 1 (Long/parseLong s))) + +(defn coerce-values [sighting] + (let [date (inst/read-instant-date (:sighting/date sighting)) ] + (-> sighting + (coerce bigdec + :sighting/latitude + :sighting/longitude) + (coerce parse-count + :sighting/count) + (assoc :sighting/date date) + (assoc :sighting/month-yr (format "%tY/%tm" date date)) + ))) + +(defn birds-by-us-county + [data] + (->> data + (pig/map + (fn [row] + (-> row + (sighting) + (coerce-values)))) + (pig/group-by (juxt :taxon/order :sighting/month-yr :sighting/state :sighting/county) + {:fold (fold/juxt (fold/count) + (->> (fold/map :sighting/count) (fold/sum)) + (->> (fold/map :sighting/count) (fold/avg)))}) + (pig/map flatten))) + +(defn species + [data] + (->> data + (pig/map + (fn [row] + (-> row + (sighting) + (coerce-values)))) + (pig/group-by (juxt :taxon/order :taxon/common-name) + {:fold (fold/count)}) + (pig/map flatten) + ;; (pig/filter (fn [x] (< 1 (last x)))) + )) + +(defn transform-birds + [input-file output-file] + (->> (birds-file input-file) + (birds-by-us-county) + (pig/store-clj output-file))) + +(comment + + (pig/dump (->> "sample_data/birds.txt" + (birds-file) + (birds-by-us-county))) + + (pig/dump (->> "sample_data/birds.txt" + (birds-file) + (species))) + + )