Skip to content

Proof-of-concept shell based on IPython #13

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
cssselect
lxml
ipython
126 changes: 126 additions & 0 deletions shell.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Parslepy shell to help building parselets on the command-line
"""

from __future__ import print_function
import re
import json

import parslepy
import lxml


def get_prop_name(key):
return re.split('[?]?[(]', key)[0]


class ParslepyShellSession(object):
def __init__(self, doc_path, outfile):
self.doc_path = doc_path
self.outfile = outfile
self.doc = self.load_document(doc_path)
self.selector = parslepy.selectors.DefaultSelectorHandler()
self.parselet_obj = {}

def load_document(self, doc_path):
parser = lxml.etree.HTMLParser()
return lxml.etree.parse(doc_path, parser=parser).getroot()

def extract_expr(self, expr):
# TODO: the current output for this may be too terse.
# In a real-world scenario I'd want to have the option of seeing
# the actual HTML too, not only the text
return self.selector.extract(self.doc, self.selector.make(expr))

def record_simple_property(self, name, expr):
self.parselet_obj[name] = expr

def record_aggregating_property(self, name, expr, is_list=False):
key = '%s(%s)' % (name, expr)
self.parselet_obj[key] = [{}] if is_list else {}

def _get_key_for_property(self, name):
for key in self.parselet_obj:
if get_prop_name(key) == name:
return key

def record_nested_property(self, parent_name, name, expr):
key = self._get_key_for_property(parent_name)

if not key:
valid_keys = [get_prop_name(k) for k in self.parselet_obj]
raise ValueError('Invalid property: %s (valid properties: %r)' %
(parent_name, valid_keys))

target_obj = self.parselet_obj[key]
if isinstance(target_obj, list):
target_obj = target_obj[0]

target_obj[name] = expr

def write(self):
with open(self.outfile, 'w') as fp:
json.dump(self.parselet_obj, fp, indent=4)
print('Wrote parselet: %s' % self.outfile)

def extract(self):
return parslepy.Parselet(self.parselet_obj).extract(self.doc)


def _run(args):
"""Run IPython shell with a few functions to help create a parselet"""
from IPython import embed as _embed

global _session
_session = ParslepyShellSession(args.input, args.parselet)

# Here are functions available in the shell:
def fetch(url, parselet=None):
global _session
_session = ParslepyShellSession(url, parselet or args.parselet)

def test(expr):
return _session.extract_expr(expr)

def add(name, expr):
_session.record_simple_property(name, expr)

def add_list(name, expr):
_session.record_aggregating_property(name, expr, is_list=True)

def add_object(name, expr):
_session.record_aggregating_property(name, expr, is_list=False)

def add_nested(parent_name, name, expr):
_session.record_nested_property(parent_name, name, expr)

def save():
_session.write()

def extract():
return _session.extract()

banner = """
Available functions:

test(expression) - test a CSS or XPath expression
fetch(url, [parselet]) - fetch a new URL, optionally starts new parselet
extract() - run the extraction using current parselet rules
add(name, expr) - add a simple property to current parselet
add_list(name, expr) - add a list property to current parselet
add_object(name, expr) - add an object property to current parselet
add_nested(parent, name, expr) - add a property to a list or object property
save() - saves current parselet to disk
\n"""
_embed(banner1=banner)


if '__main__' == __name__:
import argparse
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('parselet', help='Path to Parsley script')
parser.add_argument('input', help='URL or file')

args = parser.parse_args()
_run(args)