adjustments for building DB and using newer version of irsx #24
base: master
Makefile (new file):

```make
export IRSX_CACHE_DIRECTORY=.

YEARS = 2014 2015 2016 2017 2018 2019

all : $(patsubst %,filing_%,$(YEARS))

initialize_db :
	python manage.py makemigrations metadata
	python manage.py migrate metadata
	python manage.py load_metadata
	python manage.py makemigrations filing
	python manage.py migrate filing
	python manage.py makemigrations return
	python manage.py migrate return
	touch $@

.PRECIOUS : $(patsubst %,CSV/index_%.csv,$(YEARS))
$(IRSX_CACHE_DIRECTORY)/CSV/index_%.csv : initialize_db
	python manage.py enter_yearly_submissions $*

irs-990-form/CSV/index_2014.csv : initialize_db
	wget -O $@ -N https://s3.amazonaws.com/irs-form-990/$(notdir $@)
	perl -i.bak -p -e 's/SILVERCREST ASSET ,AMAGEMENT/SILVERCREST ASSET MANAGEMENT/g' $@
	python manage.py enter_yearly_submissions 2014
```
Author: It's a lot faster to download the files with the aws-cli and then run 'load_filings' than to have 'load_filings' also be responsible for downloading. On the downside, this requires folks to have an AWS account.

```make
download_filings_% :
	aws s3 sync s3://irs-form-990 $(IRSX_CACHE_DIRECTORY)/XML --exclude "*" --include "$**.xml" --no-progress | pv -l > /dev/null
```
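For anyone who wants the same step from Python instead of the aws-cli, here is a rough sketch using boto3 (an assumption on my part; the Makefile itself shells out to the aws-cli). It lists the public irs-form-990 bucket and downloads the XML files whose object IDs begin with the requested year, skipping files already on disk. Function and parameter names are illustrative only.

```python
# Hypothetical boto3 equivalent of the download_filings_<year> target.
# Assumes boto3 is installed and AWS credentials are configured, just as
# the aws-cli invocation does.
import os
import boto3

def download_filings(year, dest="XML", bucket="irs-form-990"):
    s3 = boto3.client("s3")
    os.makedirs(dest, exist_ok=True)
    paginator = s3.get_paginator("list_objects_v2")
    # Object IDs start with the submission year, so a year prefix
    # mirrors the Makefile's --include "<year>*.xml" filter.
    for page in paginator.paginate(Bucket=bucket, Prefix=str(year)):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            if not key.endswith(".xml"):
                continue
            target = os.path.join(dest, key)
            if not os.path.exists(target):  # crude "sync": skip files already present
                s3.download_file(bucket, key, target)

# Example: download_filings(2016, dest="./XML")
```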
```make
filing_% : $(IRSX_CACHE_DIRECTORY)/CSV/index_%.csv download_filings_%
	python manage.py load_filings $*
```
Django settings:

```diff
@@ -56,21 +56,21 @@
 ROOT_URLCONF = 'irsdb.urls'

-# TEMPLATES = [
-#     {
-#         'BACKEND': 'django.template.backends.django.DjangoTemplates',
-#         'DIRS': [],
-#         'APP_DIRS': False,
-#         'OPTIONS': {
-#             'context_processors': [
-#                 'django.template.context_processors.debug',
-#                 'django.template.context_processors.request',
-#                 'django.contrib.auth.context_processors.auth',
-#                 'django.contrib.messages.context_processors.messages',
-#             ],
-#         },
-#     },
-# ]
+TEMPLATES = [
+    {
+        'BACKEND': 'django.template.backends.django.DjangoTemplates',
+        'DIRS': [],
+        'APP_DIRS': False,
+        'OPTIONS': {
+            'context_processors': [
+                'django.template.context_processors.debug',
+                'django.template.context_processors.request',
+                'django.contrib.auth.context_processors.auth',
+                'django.contrib.messages.context_processors.messages',
+            ],
+        },
+    },
+]

 WSGI_APPLICATION = 'irsdb.wsgi.application'
```

Author: This needed to be uncommented in order for any of the management commands to run.
load_metadata management command:

```diff
@@ -29,6 +29,7 @@ def reload_variables(self, *args, **options):
             # row['canonical_version'] = CANONICAL_VERSION
             #else:
             # row['is_canonical'] = False
+            row['versions'] = row['versions'].split(';')
             Variable.objects.create(**row)
         print("Total Variables %s" % i)
```

Author: This is for using the github version of irsx.
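To make the change concrete, a tiny illustrative sketch (the sample row value is assumed, not taken from the real CSVs): the metadata files in the github version of irsx carry a single semicolon-delimited versions column, and splitting it produces the Python list the new ArrayField column expects.

```python
# Illustrative only; the sample value is an assumption.
row = {"db_name": "ein", "versions": "2013v3.0;2013v4.0;2014v5.0"}
row["versions"] = row["versions"].split(";")
# row["versions"] is now ["2013v3.0", "2013v4.0", "2014v5.0"],
# which Django can store directly in an ArrayField column.
```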
```diff
@@ -39,6 +40,7 @@ def reload_groups(self, *args, **options):
         infile = open(infile, 'r')
         reader = csv.DictReader(infile)
         for i, row in enumerate(reader):
+            row['versions'] = row['versions'].split(';')
             try:
                 if row['headless'] == '':
                     row['headless'] = None
@@ -73,6 +75,7 @@ def reload_line_numbers(self, *args, **options):
         infile = open(infile, 'r')
         reader = csv.DictReader(infile)
         for i, row in enumerate(reader):
+            row['versions'] = row['versions'].split(';')
            if i%REPORT_COUNT == 0:
                print("Created %s rows" % i)
            LineNumber.objects.create(**row)
@@ -85,6 +88,7 @@ def reload_descriptions(self, *args, **options):
         infile = open(infile, 'r')
         reader = csv.DictReader(infile)
         for i, row in enumerate(reader):
+            row['versions'] = row['versions'].split(';')
            if i%REPORT_COUNT == 0:
                print("Created %s rows" % i)
            Description.objects.create(**row)
@@ -97,4 +101,4 @@ def handle(self, *args, **options):
         self.reload_groups()
         self.reload_schedule_parts()
         self.reload_line_numbers()
         self.reload_descriptions()
```
Metadata models:

```diff
@@ -1,4 +1,5 @@
 from django.db import models
+from django.contrib.postgres.fields import ArrayField
```

Author: I imagine you may not want to tie yourself to postgres. There is a portable version of jsonfield that could be used here.
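A hedged sketch of that portable alternative: from Django 3.1 on, models.JSONField works on any supported database backend, so the versions list could be stored as JSON rather than a Postgres ArrayField (earlier Django versions would need the third-party jsonfield package the comment alludes to). The model and field names below are illustrative, not part of this PR.

```python
# Illustrative alternative, not what this PR does.
from django.db import models

class ExampleVariable(models.Model):
    db_name = models.CharField(max_length=128)
    # A JSON list of schema versions, e.g. ["2013v3.0", "2014v5.0"],
    # portable across SQLite/MySQL/Postgres (Django 3.1+).
    versions = models.JSONField(default=list)
```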
```diff
 # Base for import of metadata csv files
 class IRSxBase(models.Model):

@@ -18,10 +19,9 @@ class Variable(IRSxBase):
     db_type = models.CharField(max_length=63, blank=True, null=True, help_text="db type", editable=False)
     line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS line number. Missing in returnheader", editable=False)
     description = models.TextField(help_text="IRS-supplied description, from .xsd. ")
-    version_start = models.TextField(help_text="Start year", null=True)
-    version_end = models.TextField(help_text="End year", null=True)
     is_canonical = models.NullBooleanField(help_text="", default=False)
     canonical_version = models.CharField(max_length=16, blank=True, null=True, help_text="canonical_version", editable=False)
+    versions = ArrayField(models.CharField(max_length=10, blank=False))

     def get_absolute_url(self):
         return ("/metadata/variable/%s-%s.html" % (self.db_table, self.db_name))

@@ -31,8 +31,7 @@ class Group(IRSxBase):
     line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS-supplied line numbers. Missing for returnheaders", editable=False)
     description = models.TextField(help_text="IRS-supplied description, from .xsd. ")
     headless = models.NullBooleanField(help_text="", default=False)
-    version_start = models.TextField(help_text="Start year", null=True)
-    version_end = models.TextField(help_text="End year", null=True)
+    versions = ArrayField(models.CharField(max_length=10, blank=False))

     def get_absolute_url(self):
         return ("/metadata/groups/%s.html" % self.db_name)

@@ -50,12 +49,11 @@ def get_absolute_url(self):
 class LineNumber(models.Model):
     xpath = models.CharField(max_length=255, blank=True, null=True, help_text="xpath", editable=False) #is this not equivalent to xpath?
-    version_start = models.TextField(help_text="Start year", null=True)
-    version_end = models.TextField(help_text="End year", null=True)
     line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS-supplied line numbers. Missing for returnheaders", editable=False)
+    versions = ArrayField(models.CharField(max_length=10, blank=False))


 class Description(models.Model):
     xpath = models.CharField(max_length=255, blank=True, null=True, help_text="xpath", editable=False) #is this not equivalent to xpath?
-    version_start = models.TextField(help_text="Start year", null=True)
-    version_end = models.TextField(help_text="End year", null=True)
     description = models.TextField(help_text="description")
+    versions = ArrayField(models.CharField(max_length=10, blank=False))
```
Views module:

```diff
@@ -21,7 +21,7 @@
 # The base of the file system
 try:
     FILE_SYSTEM_BASE = settings.FILE_SYSTEM_BASE
-except ImportError:
+except AttributeError:
     FILE_SYSTEM_BASE = ''
 # When set to true will 'cache' a baked version of the page
 # To run a full bake, run a 'scrape' of every page that needs update
```

Author: This is the correct exception catcher.
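For context, a minimal standalone illustration of why AttributeError is the right exception here (not code from this PR): Django's settings object raises AttributeError when an optional setting is absent, and the whole pattern can also be collapsed into getattr with a default.

```python
# Minimal illustration; assumes Django is installed and settings are configured.
from django.conf import settings

try:
    FILE_SYSTEM_BASE = settings.FILE_SYSTEM_BASE
except AttributeError:   # raised when the setting is not defined
    FILE_SYSTEM_BASE = ''

# Equivalent idiom with a default value:
FILE_SYSTEM_BASE = getattr(settings, 'FILE_SYSTEM_BASE', '')
```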
```diff
@@ -228,4 +228,4 @@ def show_forms(request):
     }
     if BAKE_OUT:
         bake(request, template, context)
     return render(request, template, context)
```
load_filings management command:

```diff
@@ -12,12 +12,12 @@
 from irsx.settings import INDEX_DIRECTORY
 from irsx.file_utils import stream_download
 from irsx.xmlrunner import XMLRunner
+from irsx.filing import InvalidXMLException

 # this is how many we process; there's a separate batch size
 # in model accumulator for how many are processed
 BATCH_SIZE = 1000


 class Command(BaseCommand):
     help = '''
     Enter the filings, one by one.

@@ -37,31 +37,38 @@ def setup(self):
     def process_sked(self, sked):
         """ Enter just one schedule """
-        #print("Processing schedule %s" % sked['schedule_name'])
+        #self.stdout.write("Processing schedule %s" % sked['schedule_name'])
         for part in sked['schedule_parts'].keys():
             partname = part
             partdata = sked['schedule_parts'][part]
-            #print("part %s %s" % (partname, partdata))
+            #self.stdout.write("part %s %s" % (partname, partdata))
             self.accumulator.add_model(partname, partdata)

         for groupname in sked['groups'].keys():
             for groupdata in sked['groups'][groupname]:
-                #print("group %s %s" % (groupname, groupdata) )
+                #self.stdout.write("group %s %s" % (groupname, groupdata) )
                 self.accumulator.add_model(groupname, groupdata)
```

Author: Just using the django command guidance here.
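As a reminder of the convention being followed, here is a minimal sketch of the Django management-command output guidance (illustrative, not part of this PR): commands write to self.stdout / self.stderr rather than calling print, so output can be captured, redirected, or silenced by callers such as call_command.

```python
# Minimal illustrative command; the command itself is hypothetical.
from django.core.management.base import BaseCommand

class Command(BaseCommand):
    help = "Example of the self.stdout / self.stderr convention"

    def handle(self, *args, **options):
        self.stdout.write("normal progress message")        # instead of print()
        self.stderr.write("something that deserves attention")
        self.stdout.write(self.style.SUCCESS("done"))        # optional styling
```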
```diff
     def run_filing(self, filing):
         object_id = filing.object_id
-        print("run_filing %s" % object_id)
-        parsed_filing = self.xml_runner.run_filing(object_id)
+        self.stdout.write("run_filing %s" % object_id)
+
+        # if we get a bad xml file, delete the file and retry once
+        try:
+            parsed_filing = self.xml_runner.run_filing(object_id)
+        except InvalidXMLException as e:
+            os.remove(e.filepath)
+            parsed_filing = self.xml_runner.run_filing(object_id)
```

Author: This was useful for me.
```diff
         if not parsed_filing:
-            print("Skipping filing %s(filings with pre-2013 filings are skipped)\n row details: %s" % (filing, metadata_row))
+            self.stderr.write("Skipping filing %s(filings with pre-2013 filings are skipped)\n row details: %s" % (filing, metadata_row))
             return None

         schedule_list = parsed_filing.list_schedules()
-        #print("sked list is %s" % schedule_list)
+        #self.stdout.write("sked list is %s" % schedule_list)

         result = parsed_filing.get_result()

@@ -75,9 +82,8 @@ def run_filing(self, filing):
         if keyerrors:
             # If we find keyerrors--xpaths that are missing from our spec, note it
-            print("Key error %s")
             has_keyerrors = len(keyerrors) > 0
-            print("keyerror: %s" % keyerrors)
+            self.stderr.write("keyerror: %s" % keyerrors)
             filing.error_details = str(keyerrors)
             filing.key_error_count = len(keyerrors)
             filing.is_error = has_keyerrors

@@ -87,7 +93,7 @@ def run_filing(self, filing):
             for sked in result:
                 self.process_sked(sked)
         else:
-            print("Filing not parsed %s " % object_id)
+            self.stderr.write("Filing not parsed %s " % object_id)


     def handle(self, *args, **options):

@@ -96,14 +102,14 @@ def handle(self, *args, **options):
         if year not in [2014, 2015, 2016, 2017, 2018, 2019]:
             raise RuntimeError("Illegal year `%s`. Please enter a year between 2014 and 2019" % year)

-        print("Running filings during year %s" % year)
+        self.stdout.write("Running filings during year %s" % year)
         self.setup()

         process_count = 0
         while True:
             filings=Filing.objects.filter(submission_year=year).exclude(parse_complete=True)[:100]
             if not filings:
-                print("Done")
+                self.stdout.write("Done")
                 break

             object_id_list = [f.object_id for f in filings]

@@ -112,14 +118,14 @@ def handle(self, *args, **options):
             Filing.objects.filter(object_id__in=object_id_list).update(parse_started=True)

             for filing in filings:
-                #print("Handling id %s" % filing.object_id)
+                #self.stdout.write("Handling id %s" % filing.object_id)
                 self.run_filing(filing)
                 process_count += 1
                 if process_count % 1000 == 0:
-                    print("Handled %s filings" % process_count)
+                    self.stdout.write("Handled %s filings" % process_count)

             # commit anything that's left
             self.accumulator.commit_all()
             # record that all are complete
             Filing.objects.filter(object_id__in=object_id_list).update(process_time=datetime.now(), parse_complete=True)
-            print("Processed a total of %s filings" % process_count)
+            self.stdout.write("Processed a total of %s filings" % process_count)
```
Author: This is a Makefile I made for building the whole db.