diff --git a/.gitignore b/.gitignore index e2da8db..1b33ff0 100644 --- a/.gitignore +++ b/.gitignore @@ -97,4 +97,6 @@ ENV/ .ropeproject # mkdocs documentation -/site \ No newline at end of file +/site + +.idea diff --git a/README.md b/README.md index c95f37b..e57c4a0 100644 --- a/README.md +++ b/README.md @@ -38,4 +38,7 @@ $ python handler.py # Changelog -1.0 - Release \ No newline at end of file +``` +1.0 - Release +1.1 - Add keywords search +``` diff --git a/css/main.css b/css/main.css index dc2a427..6b2e3de 100755 --- a/css/main.css +++ b/css/main.css @@ -7,21 +7,21 @@ padding: 10px; } -#results { font-family: Verdana; font-size: 14px; } +#results { font-family: Verdana; font-size: 14px; } #results h1 { font-size: 150%; margin: 0; padding: 5px; -} -#results h2 { font-size: 125%; color: #666; } +} +#results h2 { font-size: 125%; color: #666; } #results .file { padding: 5px; } -#results .link { - padding: 3px; +#results .link { + padding: 3px; border: 1px solid #555; background: #eee none repeat scroll 0 0; -} +} #results .highlight { color: #000; background: yellow; } #results .result { @@ -35,7 +35,7 @@ background-color: #000 !important; border-color: #000 !important; } -.navbar-inverse .navbar-collapse, +.navbar-inverse .navbar-collapse, .navbar-inverse .navbar-form { background-color: #000; } diff --git a/handler.py b/handler.py index dbc2d26..ef51d6b 100644 --- a/handler.py +++ b/handler.py @@ -1,6 +1,7 @@ from __future__ import print_function import tornado.ioloop, tornado.web, tornado.autoreload from tornado.escape import json_encode, json_decode +import json import safeurl, types, sys, re, mimetypes, glob, jsbeautifier, urlparse, pycurl import calendar, time, datetime @@ -15,7 +16,7 @@ #------------------------------------------------------------ class BaseHandler(tornado.web.RequestHandler): - + def get_current_user(self): return [] @@ -39,7 +40,7 @@ def get_current_user(self): class MainHandler(BaseHandler): def initialize(self): return - + def get(self): self.render( 'templates/index.html', @@ -52,13 +53,13 @@ def get(self): class ViewAboutHandler(BaseHandler): def initialize(self): return - + def get(self): self.render( 'templates/about.html', ) - + #------------------------------------------------------------ # /parse/ajax #------------------------------------------------------------ @@ -77,7 +78,7 @@ def find_str(self, s, char): return index index += 1 return -1 - + def findEntireLine(self, contents, str): lineNum = 0 for item in contents.split("\n"): @@ -85,13 +86,13 @@ def findEntireLine(self, contents, str): linkPos = self.find_str(item, str) return item,lineNum,linkPos lineNum = lineNum+1 - + def parseForLinks(self, contents): discoveredLinks = [] outputLinks = [] # ugh lol regex = r"[^/][`'\"]([\/][a-zA-Z0-9_.-]+)+(?!([gimuy]*[,;\s])|\/\2)" - links = re.finditer(regex, contents) + links = re.finditer(regex, contents) for link in links: linkStr = link.group(0) # discoveredLinks list to avoid dupes and complex dupe checks @@ -107,6 +108,33 @@ def parseForLinks(self, contents): }) return outputLinks + def parseForKeywords(self, contents, keywords=[]): + if len(keywords) == 0: + return [] + + discoveredLinks = [] + outputLinks = [] + # ugh yeah + + for keyword in keywords: + regex = r".*"+re.escape(keyword)+".*" + links = re.finditer(regex, contents) + for link in links: + linkStr = link.group(0) + # discoveredLinks list to avoid dupes and complex dupe checks + if linkStr not in discoveredLinks: + # get the entire line, line number, and link position + entireLine,lineNum,linkPos = self.findEntireLine(contents, linkStr) + discoveredLinks.append(linkStr) + # print(entireLine) + outputLinks.append({ + "line": entireLine, + "link": linkStr, + "lineNum": lineNum, + "linkPos": linkPos + }) + return outputLinks + def getFormattedTimestamp(self): d = datetime.datetime.now() formatted = "{}_{}_{}_{}-{}".format(d.month, d.day, d.year, d.hour, d.minute) @@ -115,25 +143,23 @@ def getFormattedTimestamp(self): def formatHTMLOutput(self, html): output = output + html return output - + def beautifyJS(self, content): return jsbeautifier.beautify(content) def isLongLine(self, line): - if len(line)>1000: - return True - return False - - def fileRoutine(self, url, content): + return len(line)>1000 + + def fileRoutine(self, url, content, keywords): html = "" - + # beautify the JS for cleaner parsing # note: this can be slow against large JS files and can lead to failure prettyContent = self.beautifyJS(content) - + # parse all the links out - parsedLinks = self.parseForLinks(prettyContent) - + parsedLinks = self.parseForLinks(prettyContent) + self.parseForKeywords(prettyContent, keywords) + # if we have results, start building HTML if parsedLinks: print("Discovered {} links in {}".format(len(parsedLinks), url)) @@ -141,7 +167,7 @@ def fileRoutine(self, url, content): # html = html+'

{}

'.format(url) html = html+'
' for link in parsedLinks: - html = html+"

{}

".format(link["link"][1:]) + html = html+"

{}

".format(link["link"][0:].replace("<", "<")) # Get positions for highlighting startPos = link["linkPos"] endPos = link["linkPos"]+len(link["link"]) @@ -179,38 +205,44 @@ def fetchURL(self, url, headers=[]): res = sc.execute(url) return res - def parseLinks(self, url, headers=[]): + def parseLinks(self, url, headers=[], keywords=[]): html = "" file = self.fetchURL(url, headers) - html = html + self.fileRoutine(url, file) + html = html + self.fileRoutine(url, file, keywords) return html def post(self): - + error = False errorMsg = "" - + url = self.get_argument("url") headers = self.get_argument("headers", []) + keywords = self.get_argument("keywords", False) + if (keywords == False): + keywords = [] + else: + keywords = json.loads(keywords) + if error == False: - - data = self.parseLinks(url, headers) + + data = self.parseLinks(url, headers, keywords) # set content-type self.set_header('Content-Type', 'application/json') - + # output self.write(json_encode({ "url": url, "output": data, })) - + else: self.write("error") - + #------------------------------------------------------------ # Main #------------------------------------------------------------ diff --git a/js/parsejs.js b/js/parsejs.js index aad7ba1..135b937 100755 --- a/js/parsejs.js +++ b/js/parsejs.js @@ -10,12 +10,16 @@ class Result { this.result = result; this.output = output; } -} +} function toggleHeaders() { $("#headers").toggle(); } +function toggleKeywords() { + $("#keywords").toggle(); +} + function getCustomHeaders() { var headers = $('#customHeaders').val().split("\n"); var customHeaders = []; @@ -27,10 +31,19 @@ function getCustomHeaders() { return null; } +function removeDuplicateAndEmpty(vals){ + var uniqueVals = []; + $.each(vals, function(i, el){ + if(el !== '' && $.inArray(el, uniqueVals) === -1) uniqueVals.push(el); + }); + return uniqueVals; +} + function parseJS(url) { $.post("/parse/ajax", { url: url, headers: getCustomHeaders(), + keywords: JSON.stringify(removeDuplicateAndEmpty($('#customKeywords').val().split("\n"))) }, function(data) { var succeeded = false; // success if we have output @@ -142,7 +155,7 @@ function hideResults() { $(function() { rebuildResults(); - $('#hideResults').change(function() { + $('#hideResults').change(function() { toggleResults(); }); -}); \ No newline at end of file +}); diff --git a/templates/index.html b/templates/index.html index 4f910b6..0b16b71 100755 --- a/templates/index.html +++ b/templates/index.html @@ -1,7 +1,7 @@ {% module Template("header.html") %}

Parse URLs

- +
@@ -11,6 +11,32 @@

Parse URLs

+
+ + +