Skip to content

Commit

Permalink
slim version of module live and topical. removed everything that was …
Browse files Browse the repository at this point in the history
…not necessary anymore
  • Loading branch information
dirkweissenborn committed Jul 8, 2013
1 parent 65bff5e commit 48c53d5
Show file tree
Hide file tree
Showing 12 changed files with 290 additions and 180 deletions.
1 change: 1 addition & 0 deletions conf/indexing.properties
Original file line number Diff line number Diff line change
Expand Up @@ -65,5 +65,6 @@ org.dbpedia.spotlight.yahoo.region = us

# Topical configuration
org.dbpedia.spotlight.data.sortedArticlesCategories=/media/dirk/Data/Wikipedia/sorted.article_categories_en.nt
# Only NaiveBayesTopicalClassifier is supported so far
org.dbpedia.spotlight.topic.classifier.type=NaiveBayesTopicalClassifier
org.dbpedia.spotlight.topic.description=conf/topic_descriptions.xml
44 changes: 1 addition & 43 deletions conf/topic_descriptions.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,98 +6,79 @@
<iptc mediatopic="20000003"/>
<iptc mediatopic="20000004"/>
<categories>Animation,Cartooning</categories>
<keywords></keywords>
</topic>

<topic name="cinema">
<iptc mediatopic="20000005"/>
<categories>Film</categories>
<keywords></keywords>
</topic>


<topic name="literature">
<iptc mediatopic="20000013"/>
<categories>Literature</categories>
<keywords></keywords>
</topic>

<topic name="music">
<iptc mediatopic="20000018"/>
<categories>Music</categories>
<keywords></keywords>
<categories>Music,Music_genres,20th-century_music_genres</categories>
</topic>

<topic name="performing_arts"> <!--"theatre_dance_opera" renamed because many things from all performing arts occurred-->
<iptc mediatopic="20000028"/>
<iptc mediatopic="20000029"/>
<iptc mediatopic="20000007"/>
<categories>Opera,Opera_genres,Dance,Theatre</categories>
<keywords></keywords>
</topic>

<!-- Visual arts -->

<topic name="architecture">
<iptc mediatopic="20000032"/>
<categories>Architecture</categories>
<keywords></keywords>
<!--feed url="http://topics.nytimes.com/top/reference/timestopics/subjects/a/architecture/index.html?rss=1"/>
<feed url="http://www.architectsjournal.co.uk/XmlServers/navsectionRSS.aspx?navsectioncode=3"/-->
</topic>

<!--topic name="fashion">
<iptc mediatopic="20000011"/>
<categories>Fashion,Clothing</categories>
<keywords></keywords>
</topic DID NOT WORK WELL-->

<topic name="painting_drawing">
<iptc mediatopic="20000035"/>
<iptc mediatopic="20000034"/>
<categories>Painting,Drawing</categories>
<keywords></keywords>
</topic>

<topic name="sculpture">
<iptc mediatopic="20000037"/>
<categories>Sculpture</categories>
<keywords></keywords>
</topic>

<!-- economy, business, finance-->

<topic name="economy_business_finance">
<iptc mediatopic="20000344"/>
<categories>Business,Finance</categories>
<keywords></keywords>
</topic>

<!-- natural science -->

<topic name="biology">
<iptc mediatopic="20000719"/>
<categories>Biology</categories>
<keywords></keywords>
<!--feed url="http://feeds.biologynews.net/biologynews/headlines?format=xml"/-->
</topic>

<topic name="chemistry">
<iptc mediatopic="20000725"/>
<categories>Chemistry</categories>
<keywords></keywords>
</topic>

<topic name="geology_prehistoriclife"><!--"geology"-->
<iptc mediatopic="20000727"/>
<categories>Geology</categories>
<keywords></keywords>
</topic>

<topic name="physics">
<iptc mediatopic="20000731"/>
<categories>Physics</categories>
<keywords></keywords>
</topic>


Expand All @@ -106,28 +87,24 @@
<topic name="technology_engineering">
<iptc mediatopic="20000756"/>
<categories>Technology,Engineering</categories>
<keywords></keywords>
</topic>

<!-- Crime, Law, Justice -->

<topic name="crime">
<iptc mediatopic="20000082"/>
<categories>Criminology,Crime</categories>
<keywords></keywords>
</topic>

<topic name="law">
<iptc mediatopic="20000121"/>
<categories>Law</categories>
<keywords></keywords>
</topic>

<!-- Education -->
<topic name="education">
<iptc mediatopic="05000000"/>
<categories>Education</categories>
<keywords></keywords>
</topic>


Expand All @@ -136,51 +113,43 @@
<iptc mediatopic="20000248"/>
<iptc mediatopic="20000244"/>
<categories>Food_and_drink,Cuisine</categories>
<keywords></keywords>
</topic>

<!-- Social Sciences -->

<topic name="anthropology">
<iptc mediatopic="20000743"/>
<categories>Anthropology</categories>
<keywords></keywords>
</topic>

<topic name="archaeology">
<iptc mediatopic="20000744"/>
<categories>Archaeology</categories>
<keywords></keywords>
</topic>

<topic name="economics">
<iptc mediatopic="20000745"/>
<categories>Economics</categories>
<keywords></keywords>
</topic>

<topic name="geography">
<iptc mediatopic="20000746"/>
<categories>Geography,Places</categories>
<keywords></keywords>
</topic>

<topic name="history">
<iptc mediatopic="20000747"/>
<categories>History,Chronology</categories>
<keywords></keywords>
</topic>

<topic name="philosophy">
<iptc mediatopic="20000751"/>
<categories>Philosophy</categories>
<keywords></keywords>
</topic>

<topic name="psychology">
<iptc mediatopic="20000753"/>
<categories>Psychology</categories>
<keywords></keywords>
</topic>

<!-- politics -->
Expand All @@ -189,78 +158,67 @@
<iptc mediatopic="11000000"/>
<iptc mediatopic="20000752"/>
<categories>Politics,Political_science</categories>
<keywords></keywords>
</topic>

<!-- Health -->
<topic name="health">
<iptc mediatopic="07000000"/>
<categories>Health,Diseases_and_disorders,Health_sciences</categories>
<keywords></keywords>
</topic>

<!-- Structural Science -->

<topic name="mathematics">
<iptc mediatopic="20000715"/>
<categories>Mathematics</categories>
<keywords></keywords>
</topic>


<topic name="computer_science">
<iptc mediatopic="20000763"/>
<categories>Computer_science,Computing</categories>
<keywords></keywords>
</topic>

<!-- Sport -->
<topic name="sport">
<iptc mediatopic="15000000"/>
<categories>Sports,Sports_by_year</categories>
<keywords></keywords>
</topic>

<!-- Mass media -->

<topic name="mass_media">
<iptc mediatopic="20000045"/>
<categories>Television,Radio,Mass_media,News,Journalism</categories>
<keywords></keywords>
</topic>

<!-- Culture -->

<topic name="religion_belief">
<iptc mediatopic="12000000"/>
<categories>Religion,Belief</categories>
<keywords></keywords>
</topic>


<topic name="transport">
<iptc mediatopic="20000337"/>
<categories>Transport</categories>
<keywords></keywords>
</topic>

<!--topic name="agriculture">
<iptc mediatopic="20000210"/>
<categories>Agriculture</categories>
<keywords></keywords>
</topic DID NOT WORK WELL-->

<topic name="video_game">
<iptc mediatopic="20000548"/>
<categories>Video_game_culture,Video_games</categories>
<keywords></keywords>
</topic>


<topic name="war">
<iptc mediatopic="20000056"/>
<categories>War,Military</categories>
<keywords></keywords>
</topic>

</topics>
6 changes: 6 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@
<module>eval</module>
<module>uima</module>
<module>dist</module>
<module>live</module>
<module>topical</module>
</modules>

<build>
Expand Down Expand Up @@ -288,6 +290,8 @@
</configuration>
</execution>

<!--Dependencies for dbpedia spotlight live-->

<execution>
<id>install-hunposchain0.6_mod-jar</id>
<phase>install</phase>
Expand Down Expand Up @@ -363,6 +367,8 @@
</configuration>
</execution>

<!--Spotlight live dependencies-->

</executions>
</plugin>

Expand Down
2 changes: 1 addition & 1 deletion topical/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@
<dependency>
<groupId>cc.factorie</groupId>
<artifactId>factorie</artifactId>
<version>1.0.0-M3</version>
<version>1.0.0-M4</version>
</dependency>

</dependencies>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ object TopicDescription {
for (topicItem <- xml \\ "topic") yield {
val topic = new Topic((topicItem \\ "@name").head.text) // HACK: bug fix Computer_science got read with more than 16 characters
val categories = (topicItem \\ "categories").head.text.split(",").map(category => category.toCharArray.subSequence(0, category.length).toString.trim)
val keywords = (topicItem \\ "keywords").head.text.split(",").map(category => category.toCharArray.subSequence(0, category.length).toString.trim)
//val keywords = (topicItem \\ "keywords").head.text.split(",").map(category => category.toCharArray.subSequence(0, category.length).toString.trim)

var iptcTopics = Set[String]()
for (iptcItem <- topicItem \\ "iptc")
Expand All @@ -34,9 +34,9 @@ object TopicDescription {
for (feedItem <- topicItem \\ "feed")
feeds += new URL((feedItem \\ "@url").head.text)

new TopicDescription(topic, categories, keywords, iptcTopics, feeds)
TopicDescription(topic, categories, iptcTopics, feeds)
}
}
}

class TopicDescription(val topic: Topic, val categories: Seq[String], val keywords: Seq[String], val iptcTopics: Set[String], val rssFeeds: Set[URL])
/**
 * Immutable description of a single topic, as produced by the XML loader in the
 * companion object (one instance per `topic` element).
 *
 * @param topic      the topic being described; the loader builds it from the element's `name` attribute
 * @param categories category names for the topic; the loader splits the `categories` text on commas
 *                   and trims each entry
 * @param iptcTopics identifiers gathered from the element's `iptc` children
 *                   (presumably the `mediatopic` attribute values; verify against the loader)
 * @param rssFeeds   feed URLs taken from the `url` attribute of the element's `feed` children
 */
case class TopicDescription(topic: Topic, categories: Seq[String],iptcTopics: Set[String], rssFeeds: Set[URL])
Loading

0 comments on commit 48c53d5

Please sign in to comment.