Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Django app to consume and store 990 data and metadata. Depends on [IRSx](https:/

2. install the requirements with `pip install -r requirements.txt`. This is Django 2, so only python3 is supported.

3. copy the irsdb/local\_settings.py-example file to irsdb\/local_settings.py and edit it to reflect your database settings.
3. copy the irsdb/irsdb/local\_settings.py-example file to irsdb/irsdb/local_settings.py and edit it to reflect your database settings.


### Part 2: Add the metadata
Expand Down
31 changes: 31 additions & 0 deletions irsdb/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
export IRSX_CACHE_DIRECTORY=.
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a Makefile I made for building the whole db.


YEARS = 2014 2015 2016 2017 2018 2019

all : $(patsubst %,filing_%,$(YEARS))

initialize_db :
python manage.py makemigrations metadata
python manage.py migrate metadata
python manage.py load_metadata
python manage.py makemigrations filing
python manage.py migrate filing
python manage.py makemigrations return
python manage.py migrate return
touch $@

.PRECIOUS : $(patsubst %,CSV/index_%.csv,$(YEARS))
$(IRSX_CACHE_DIRECTORY)/CSV/index_%.csv : initialize_db
python manage.py enter_yearly_submissions $*

irs-990-form/CSV/index_2014.csv : initialize_db
wget -O $@ -N https://s3.amazonaws.com/irs-form-990/$(notdir $@)
perl -i.bak -p -e 's/SILVERCREST ASSET ,AMAGEMENT/SILVERCREST ASSET MANAGEMENT/g' $@
python manage.py enter_yearly_submissions 2014

download_filings_% :
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a lot faster to download the files using the aws-cli and then run 'load_filings' than to have load_filings also be responsible for downloading.

On the downside, this requires folks to have an AWS account.

aws s3 sync s3://irs-form-990 $(IRSX_CACHE_DIRECTORY)/XML --exclude "*" --include "$**.xml" --no-progress | pv -l > /dev/null

filing_% : $(IRSX_CACHE_DIRECTORY)/CSV/index_%.csv download_filings_%
python manage.py load_filings $*

30 changes: 15 additions & 15 deletions irsdb/irsdb/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,21 +56,21 @@

ROOT_URLCONF = 'irsdb.urls'

# TEMPLATES = [
# {
# 'BACKEND': 'django.template.backends.django.DjangoTemplates',
# 'DIRS': [],
# 'APP_DIRS': False,
# 'OPTIONS': {
# 'context_processors': [
# 'django.template.context_processors.debug',
# 'django.template.context_processors.request',
# 'django.contrib.auth.context_processors.auth',
# 'django.contrib.messages.context_processors.messages',
# ],
# },
# },
# ]
TEMPLATES = [
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needed to be uncommented in order for any of the management commands to run.

{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [],
'APP_DIRS': False,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]

WSGI_APPLICATION = 'irsdb.wsgi.application'

Expand Down
6 changes: 5 additions & 1 deletion irsdb/metadata/management/commands/load_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def reload_variables(self, *args, **options):
# row['canonical_version'] = CANONICAL_VERSION
#else:
# row['is_canonical'] = False
row['versions'] = row['versions'].split(';')
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is needed when using the GitHub version of irsx.

Variable.objects.create(**row)
print("Total Variables %s" % i)

Expand All @@ -39,6 +40,7 @@ def reload_groups(self, *args, **options):
infile = open(infile, 'r')
reader = csv.DictReader(infile)
for i, row in enumerate(reader):
row['versions'] = row['versions'].split(';')
try:
if row['headless'] == '':
row['headless'] = None
Expand Down Expand Up @@ -73,6 +75,7 @@ def reload_line_numbers(self, *args, **options):
infile = open(infile, 'r')
reader = csv.DictReader(infile)
for i, row in enumerate(reader):
row['versions'] = row['versions'].split(';')
if i%REPORT_COUNT == 0:
print("Created %s rows" % i)
LineNumber.objects.create(**row)
Expand All @@ -85,6 +88,7 @@ def reload_descriptions(self, *args, **options):
infile = open(infile, 'r')
reader = csv.DictReader(infile)
for i, row in enumerate(reader):
row['versions'] = row['versions'].split(';')
if i%REPORT_COUNT == 0:
print("Created %s rows" % i)
Description.objects.create(**row)
Expand All @@ -97,4 +101,4 @@ def handle(self, *args, **options):
self.reload_groups()
self.reload_schedule_parts()
self.reload_line_numbers()
self.reload_descriptions()
self.reload_descriptions()
16 changes: 7 additions & 9 deletions irsdb/metadata/models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from django.db import models
from django.contrib.postgres.fields import ArrayField
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I imagine you may not want to tie yourself to Postgres. There is a portable version of JSONField that could be used here instead.


# Base for import of metadata csv files
class IRSxBase(models.Model):
Expand All @@ -18,10 +19,9 @@ class Variable(IRSxBase):
db_type = models.CharField(max_length=63, blank=True, null=True, help_text="db type", editable=False)
line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS line number. Missing in returnheader", editable=False)
description = models.TextField(help_text="IRS-supplied description, from .xsd. ")
version_start = models.TextField(help_text="Start year", null=True)
version_end = models.TextField(help_text="End year", null=True)
is_canonical = models.NullBooleanField(help_text="", default=False)
canonical_version = models.CharField(max_length=16, blank=True, null=True, help_text="canonical_version", editable=False)
versions = ArrayField(models.CharField(max_length=10, blank=False))

def get_absolute_url(self):
return ("/metadata/variable/%s-%s.html" % (self.db_table, self.db_name))
Expand All @@ -31,8 +31,7 @@ class Group(IRSxBase):
line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS-supplied line numbers. Missing for returnheaders", editable=False)
description = models.TextField(help_text="IRS-supplied description, from .xsd. ")
headless = models.NullBooleanField(help_text="", default=False)
version_start = models.TextField(help_text="Start year", null=True)
version_end = models.TextField(help_text="End year", null=True)
versions = ArrayField(models.CharField(max_length=10, blank=False))

def get_absolute_url(self):
return ("/metadata/groups/%s.html" % self.db_name)
Expand All @@ -50,12 +49,11 @@ def get_absolute_url(self):

class LineNumber(models.Model):
xpath = models.CharField(max_length=255, blank=True, null=True, help_text="xpath", editable=False) #is this not equivalent to xpath?
version_start = models.TextField(help_text="Start year", null=True)
version_end = models.TextField(help_text="End year", null=True)
line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS-supplied line numbers. Missing for returnheaders", editable=False)
line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS-supplied line numbers. Missing for returnheaders", editable=False)
versions = ArrayField(models.CharField(max_length=10, blank=False))


class Description(models.Model):
xpath = models.CharField(max_length=255, blank=True, null=True, help_text="xpath", editable=False) #is this not equivalent to xpath?
version_start = models.TextField(help_text="Start year", null=True)
version_end = models.TextField(help_text="End year", null=True)
description = models.TextField(help_text="description")
versions = ArrayField(models.CharField(max_length=10, blank=False))
4 changes: 2 additions & 2 deletions irsdb/metadata/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
# The base of the file system
try:
FILE_SYSTEM_BASE = settings.FILE_SYSTEM_BASE
except ImportError:
except AttributeError:
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the correct exception to catch here: a missing settings attribute raises AttributeError, not ImportError.

FILE_SYSTEM_BASE = ''
# When set to true will 'cache' a baked version of the page
# To run a full bake, run a 'scrape' of every page that needs update
Expand Down Expand Up @@ -228,4 +228,4 @@ def show_forms(request):
}
if BAKE_OUT:
bake(request, template, context)
return render(request, template, context)
return render(request, template, context)
38 changes: 22 additions & 16 deletions irsdb/return/management/commands/load_filings.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@
from irsx.settings import INDEX_DIRECTORY
from irsx.file_utils import stream_download
from irsx.xmlrunner import XMLRunner
from irsx.filing import InvalidXMLException

# this is how many we process; there's a separate batch size
# in model accumulator for how many are processed
BATCH_SIZE = 1000


class Command(BaseCommand):
help = '''
Enter the filings, one by one.
Expand All @@ -37,31 +37,38 @@ def setup(self):

def process_sked(self, sked):
""" Enter just one schedule """
#print("Processing schedule %s" % sked['schedule_name'])
#self.stdout.write("Processing schedule %s" % sked['schedule_name'])
for part in sked['schedule_parts'].keys():
partname = part
partdata = sked['schedule_parts'][part]
#print("part %s %s" % (partname, partdata))
#self.stdout.write("part %s %s" % (partname, partdata))
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just following the Django management-command guidance here: write output via self.stdout/self.stderr instead of print().


self.accumulator.add_model(partname, partdata)

for groupname in sked['groups'].keys():
for groupdata in sked['groups'][groupname]:
#print("group %s %s" % (groupname, groupdata) )
#self.stdout.write("group %s %s" % (groupname, groupdata) )
self.accumulator.add_model(groupname, groupdata)


def run_filing(self, filing):
object_id = filing.object_id
print("run_filing %s" % object_id)

parsed_filing = self.xml_runner.run_filing(object_id)
self.stdout.write("run_filing %s" % object_id)

# if we get a bad xml file, delete the file and retry once
try:
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was useful for me: on a corrupt cached XML file, delete it and retry the download/parse once.

parsed_filing = self.xml_runner.run_filing(object_id)
except InvalidXMLException as e:
os.remove(e.filepath)
parsed_filing = self.xml_runner.run_filing(object_id)

if not parsed_filing:
print("Skipping filing %s(filings with pre-2013 filings are skipped)\n row details: %s" % (filing, metadata_row))
self.stderr.write("Skipping filing %s(filings with pre-2013 filings are skipped)\n row details: %s" % (filing, metadata_row))
return None

schedule_list = parsed_filing.list_schedules()
#print("sked list is %s" % schedule_list)
#self.stdout.write("sked list is %s" % schedule_list)

result = parsed_filing.get_result()

Expand All @@ -75,9 +82,8 @@ def run_filing(self, filing):

if keyerrors:
# If we find keyerrors--xpaths that are missing from our spec, note it
print("Key error %s")
has_keyerrors = len(keyerrors) > 0
print("keyerror: %s" % keyerrors)
self.stderr.write("keyerror: %s" % keyerrors)
filing.error_details = str(keyerrors)
filing.key_error_count = len(keyerrors)
filing.is_error = has_keyerrors
Expand All @@ -87,7 +93,7 @@ def run_filing(self, filing):
for sked in result:
self.process_sked(sked)
else:
print("Filing not parsed %s " % object_id)
self.stderr.write("Filing not parsed %s " % object_id)


def handle(self, *args, **options):
Expand All @@ -96,14 +102,14 @@ def handle(self, *args, **options):
if year not in [2014, 2015, 2016, 2017, 2018, 2019]:
raise RuntimeError("Illegal year `%s`. Please enter a year between 2014 and 2019" % year)

print("Running filings during year %s" % year)
self.stdout.write("Running filings during year %s" % year)
self.setup()

process_count = 0
while True:
filings=Filing.objects.filter(submission_year=year).exclude(parse_complete=True)[:100]
if not filings:
print("Done")
self.stdout.write("Done")
break

object_id_list = [f.object_id for f in filings]
Expand All @@ -112,14 +118,14 @@ def handle(self, *args, **options):
Filing.objects.filter(object_id__in=object_id_list).update(parse_started=True)

for filing in filings:
#print("Handling id %s" % filing.object_id)
#self.stdout.write("Handling id %s" % filing.object_id)
self.run_filing(filing)
process_count += 1
if process_count % 1000 == 0:
print("Handled %s filings" % process_count)
self.stdout.write("Handled %s filings" % process_count)

# commit anything that's left
self.accumulator.commit_all()
# record that all are complete
Filing.objects.filter(object_id__in=object_id_list).update(process_time=datetime.now(), parse_complete=True)
print("Processed a total of %s filings" % process_count)
self.stdout.write("Processed a total of %s filings" % process_count)