3/11/2019

Zoher15 · Zoher15 · commit 12c854925e9f · 2019-03-11T11:16:41.000-04:00
diff --git a/RDF Files/__pycache__/settings.cpython-36.pyc b/RDF Files/__pycache__/settings.cpython-36.pyc
diff --git a/RDF Files/select rdf.py b/RDF Files/select rdf.py
@@ -0,0 +1,12 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import pandas as pd
+import settings
+from sqlalchemy import create_engine
+from sqlalchemy.engine.url import URL
+engine=create_engine(URL(**settings.DATABASE),connect_args={'charset':'utf8'})  # one engine reused by every lookup below
+# Claim IDs whose linked entity text matches the LIKE pattern supplied through params.
+CLAIM_QUERY='SELECT cl.claimID FROM centralifact.claim cl inner join claim_entity ce on cl.claimID=ce.claimID inner join entity e on e.entityID=ce.entityID where e.entity_text like %s'
+for entity_pattern in ("Hillary Clinton%","Donald Trump%"):  # trailing % is the SQL LIKE wildcard
+    data=pd.read_sql_query(CLAIM_QUERY,con=engine,params=(entity_pattern,))
+    print(data['claimID'])
diff --git a/RDF Files/settings.py b/RDF Files/settings.py
@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for centralifact project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'centralifact'
+SPIDER_MODULES = ['centralifact.spiders']
+NEWSPIDER_MODULE = 'centralifact.spiders'
+DATABASE={  # connection parameters, one key per URL component
+    'drivername': 'mysql',
+    'host': 'burns.cs.indiana.edu',
+    'port': '3306',
+    'username': 'centralifact',
+    'password': 'Z-J-K+C-Fact=My+Sql',  # FIXME(review): plaintext credential committed to version control; rotate it and load it from the environment or an untracked file
+    'database': 'centralifact'
+}
+USER_AGENT = 'Indiana-University-Researcher-Zoher-Kachwala(zkachwal@iu.edu)'  # identifies the operator, as the template comment below recommends
+SPIDER_MIDDLEWARES = {
+    'scrapy_deltafetch.DeltaFetch': 100,
+}
+DELTAFETCH_ENABLED = True
+DELTAFETCH_RESET = False
+AUTOTHROTTLE_ENABLED = True
+ROBOTSTXT_OBEY = True
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'centralifact (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+#ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'centralifact.middlewares.CentralifactSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'centralifact.middlewares.CentralifactDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'centralifact.pipelines.CentralifactPipeline': 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/neo4j_commands.cypher b/neo4j_commands.cypher
@@ -15,7 +15,7 @@ RETURN arg1,rel,arg2
 
 //Loading RDF
 CREATE INDEX ON :Resource(uri)//prestep
-CALL semantics.importRDF("file:///claimreviews_db_2_12_10_52AM.rdf","RDF/XML", { shortenUrls: false, typesToLabels: true, commitSize: 9000 })
+CALL semantics.importRDF("file:///claimreviews_db_2_12_10_52AM.rdf","RDF/XML", { shortenUrls: false, typesToLabels: false, commitSize: 9000 })
 
 CALL semantics.importRDF("file:///C:/Users/zoya/Desktop/Zoher/factcheckgraph/claimreviews_db_2_12_10_52AM.rdf","RDF/XML", { shortenUrls: false, typesToLabels: true, commitSize: 9000 })
 
diff --git a/neo4j_sequence.cypher b/neo4j_sequence.cypher
@@ -1,18 +1,45 @@
+///Setting Degree
+MATCH (n) // every node, including those with no outgoing relationship
+WITH n,size((n)-->()) as degree // number of outgoing relationships of n
+RETURN degree
+
+// Store on each relationship the out-degree of its start node
+MATCH (node)-[r]->() // directed, so each relationship is matched once, from its start node
+WITH r,size((node)-->()) as degree // count all outgoing relationships; a pattern with r already bound can only count 0 or 1
+SET r.degree=degree
+return r.degree
+
+MATCH (n)-[out]->(target) // one row per outgoing relationship
+WITH n, count(target) AS outdeg, collect(out) AS outgoing // aggregate per start node n
+UNWIND outgoing AS r
+SET n.outdegree=outdeg
+SET r.degree=outdeg
+RETURN r.degree
+
+
 ////Get Donald_Trump rels
 MATCH (n{uri:'http://dbpedia.org/resource/Donald_Trump'})-[r]-(m) Return n,r,m LIMIT 20 // direct neighbours, either direction
 MATCH (n{uri:'http://dbpedia.org/resource/Barack_Obama'})-[r]-(m) Return n,r,m LIMIT 20 // same lookup for Barack_Obama
 http://dbpedia.org/resource/Barack_Obama
 MATCH (n{uri:'http://dbpedia.org/resource/Donald_Trump'})-[r*1..2]-(m) Return n,r,m LIMIT 50 // widened to paths of 1 to 2 relationships
 MATCH a=(n:`schema.org/Person`)-[r*1..3]->(m:`schema.org/Person`) Return a LIMIT 20 // directed paths of 1 to 3 relationships between two schema.org/Person nodes
 MATCH a=(n:`schema.org/Person`)-[r*1..3]->(m:`schema.org/Person`) Return a LIMIT 20 // NOTE(review): exact duplicate of the previous query
-MATCH a=(n{uri:'http://dbpedia.org/resource/Donald_Trump'})-[*0..3]-(m) where m.uri starts with 'http://dbpedia.org/resource/' Return a LIMIT 100
+MATCH a=(n{uri:'http://dbpedia.org/resource/Donald_Trump'})-[*1..5]-(m) where m.uri starts with 'http://dbpedia.org/resource/' Return a LIMIT 100 // paths of 1 to 5 relationships that end on a dbpedia resource
 //
 MATCH a=(n{uri:'http://dbpedia.org/resource/Donald_Trump'})-[*1..10]->(m{uri:'http://dbpedia.org/resource/Hillary_Clinton'}) Return a LIMIT 10 // directed paths of up to 10 relationships from Donald_Trump to Hillary_Clinton
 MATCH (from{uri:'http://dbpedia.org/resource/Donald_Trump'}),(to{uri:'http://dbpedia.org/resource/Hillary_Clinton'}) CALL algo.shortestPath.stream(from, to, "cost") // weighted by the "cost" property
 yield path as path, cost as cost // NOTE(review): algo.shortestPath.stream is documented to yield nodeId and cost; confirm that path is a valid output column
 return path,cost // returns the yielded columns unchanged
-
-MATCH (from{uri:'http://dbpedia.org/resource/Donald_Trump'}),(to{uri:'http://dbpedia.org/resource/United_States'}),path = shortestpath((from)-[*]-(to))
+//
+MATCH (n) WHERE n.uri = 'http://dbpedia.org/resource/Hillary_Clinton' // start node: the Hillary_Clinton resource
+CALL apoc.path.subgraphNodes(n, {maxLevel:1,whitelistNodes:"+'http://schema.org/Person'"}) YIELD node // NOTE(review): whitelistNodes is given a "+label" style string; APOC documents whitelistNodes as a list of nodes and labelFilter for "+Label" filters; confirm which was intended
+RETURN node
+//
+MATCH (n{uri:'http://dbpedia.org/resource/Hillary_Clinton'}) MATCH (m) where m.uri starts with 'http://dbpedia.org/resource/' WITH n,collect(m) as allowed // gather every dbpedia resource node into one list, giving a single row
+CALL apoc.path.subgraphNodes(n, {maxLevel:10,whitelistNodes:allowed}) YIELD node // whitelistNodes takes a list of nodes; the procedure now runs once instead of once per (n,m) pair
+RETURN node
+//
+MATCH (from{uri:'http://dbpedia.org/resource/Vladimir_Putin'}),(to{uri:'http://dbpedia.org/resource/United_States'}),path = shortestpath((from)-[*]-(to)) // one shortest path, any direction, unbounded length
 with path
 WHERE length(path)>2 // filter written after the WITH; keeps the path only if it has more than 2 relationships
 return path
@@ -34,7 +61,7 @@ MATCH (n)-[r]-(n) delete r
 
 //1 Import
 CREATE INDEX ON :Resource(uri)//prestep
-CALL semantics.importRDF("file:///C:/Users/zoya/Downloads/claimreviews_db.rdf","RDF/XML", { shortenUrls: false, typesToLabels: true, commitSize: 25000 })
+CALL semantics.importRDF("file:///C:/Users/zoya/Downloads/claimreviews_db.rdf","RDF/XML", { shortenUrls: false, typesToLabels: false, commitSize: 25000 }) // import now runs with typesToLabels set to false
 //2 labeling the nodes
 MATCH (n) with n, SPLIT(n.uri,"/")[-1] as name SET n.label=name return n.label // label = text after the last "/" of the uri
 MATCH (n) with n, SPLIT(n.label,"#")[-1] as name SET n.label=name return n.label // then keep only the text after the last "#" of that label
@@ -90,4 +117,9 @@ delete r1,o,r,m
 //
 MATCH ()-[r:`http://www.ontologydesignpatterns.org/ont/fred/quantifiers.owl#hasDeterminer`]->(m) // every hasDeterminer relationship and its target node m
 OPTIONAL MATCH (m)-[r1]->(o) // plus whatever m points to, if anything
-delete r1,o,r,m
+delete r1,o,r,m // NOTE(review): plain DELETE, not DETACH DELETE; presumably fails if o or m still has other relationships; confirm
+
+// Set each node's label property to the local name of its uri, one node per row
+MATCH (n) // no collect()/UNWIND round-trip, so the full node set is never gathered into a single list
+SET n.label=semantics.getIRILocalName(n.uri)
+RETURN [n,n.uri] AS event