diff --git a/README.md b/README.md
index dbd8d90..55b555b 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ Django app to consume and store 990 data and metadata. Depends on [IRSx](https:/
 
 2. install the requirements with `pip install -r requirements.txt`. This is Django 2, so only python3 is supported.
 
-3. copy the irsdb/local\_settings.py-example file to irsdb\/local_settings.py and edit it to reflect your database settings.
+3. copy the irsdb/irsdb/local\_settings.py-example file to irsdb/irsdb/local_settings.py and edit it to reflect your database settings.
 
 ### Part 2: Add the metadata
diff --git a/irsdb/Makefile b/irsdb/Makefile
new file mode 100644
index 0000000..1599554
--- /dev/null
+++ b/irsdb/Makefile
@@ -0,0 +1,31 @@
+export IRSX_CACHE_DIRECTORY=.
+
+YEARS = 2014 2015 2016 2017 2018 2019
+
+all : $(patsubst %,filing_%,$(YEARS))
+
+initialize_db :
+	python manage.py makemigrations metadata
+	python manage.py migrate metadata
+	python manage.py load_metadata
+	python manage.py makemigrations filing
+	python manage.py migrate filing
+	python manage.py makemigrations return
+	python manage.py migrate return
+	touch $@
+
+.PRECIOUS : $(patsubst %,CSV/index_%.csv,$(YEARS))
+$(IRSX_CACHE_DIRECTORY)/CSV/index_%.csv : initialize_db
+	python manage.py enter_yearly_submissions $*
+
+irs-990-form/CSV/index_2014.csv : initialize_db
+	wget -O $@ -N https://s3.amazonaws.com/irs-form-990/$(notdir $@)
+	perl -i.bak -p -e 's/SILVERCREST ASSET ,AMAGEMENT/SILVERCREST ASSET MANAGEMENT/g' $@
+	python manage.py enter_yearly_submissions 2014
+
+download_filings_% :
+	aws s3 sync s3://irs-form-990 $(IRSX_CACHE_DIRECTORY)/XML --exclude "*" --include "$**.xml" --no-progress | pv -l > /dev/null
+
+filing_% : $(IRSX_CACHE_DIRECTORY)/CSV/index_%.csv download_filings_%
+	python manage.py load_filings $*
+
diff --git a/irsdb/irsdb/settings.py b/irsdb/irsdb/settings.py
index 3e662a0..7679ec0 100644
--- a/irsdb/irsdb/settings.py
+++ b/irsdb/irsdb/settings.py
@@ -56,21 +56,21 @@ ROOT_URLCONF = 'irsdb.urls'
 
-# TEMPLATES = [
-#     {
-#         'BACKEND': 'django.template.backends.django.DjangoTemplates',
-#         'DIRS': [],
-#         'APP_DIRS': False,
-#         'OPTIONS': {
-#             'context_processors': [
-#                 'django.template.context_processors.debug',
-#                 'django.template.context_processors.request',
-#                 'django.contrib.auth.context_processors.auth',
-#                 'django.contrib.messages.context_processors.messages',
-#             ],
-#         },
-#     },
-# ]
+TEMPLATES = [
+    {
+        'BACKEND': 'django.template.backends.django.DjangoTemplates',
+        'DIRS': [],
+        'APP_DIRS': False,
+        'OPTIONS': {
+            'context_processors': [
+                'django.template.context_processors.debug',
+                'django.template.context_processors.request',
+                'django.contrib.auth.context_processors.auth',
+                'django.contrib.messages.context_processors.messages',
+            ],
+        },
+    },
+]
 
 WSGI_APPLICATION = 'irsdb.wsgi.application'
diff --git a/irsdb/metadata/management/commands/load_metadata.py b/irsdb/metadata/management/commands/load_metadata.py
index d54c776..76cc6a9 100644
--- a/irsdb/metadata/management/commands/load_metadata.py
+++ b/irsdb/metadata/management/commands/load_metadata.py
@@ -29,6 +29,7 @@ def reload_variables(self, *args, **options):
             # row['canonical_version'] = CANONICAL_VERSION
             #else:
             #    row['is_canonical'] = False
+            row['versions'] = row['versions'].split(';')
             Variable.objects.create(**row)
         print("Total Variables %s" % i)
@@ -39,6 +40,7 @@ def reload_groups(self, *args, **options):
         infile = open(infile, 'r')
         reader = csv.DictReader(infile)
         for i, row in enumerate(reader):
+            row['versions'] = row['versions'].split(';')
             try:
                 if row['headless'] == '':
                     row['headless'] = None
@@ -73,6 +75,7 @@ def reload_line_numbers(self, *args, **options):
         infile = open(infile, 'r')
         reader = csv.DictReader(infile)
         for i, row in enumerate(reader):
+            row['versions'] = row['versions'].split(';')
             if i%REPORT_COUNT == 0:
                 print("Created %s rows" % i)
             LineNumber.objects.create(**row)
@@ -85,6 +88,7 @@ def reload_descriptions(self, *args, **options):
         infile = open(infile, 'r')
         reader = csv.DictReader(infile)
         for i, row in enumerate(reader):
+            row['versions'] = row['versions'].split(';')
             if i%REPORT_COUNT == 0:
                 print("Created %s rows" % i)
             Description.objects.create(**row)
@@ -97,4 +101,4 @@ def handle(self, *args, **options):
         self.reload_groups()
         self.reload_schedule_parts()
         self.reload_line_numbers()
-        self.reload_descriptions()
\ No newline at end of file
+        self.reload_descriptions()
diff --git a/irsdb/metadata/models.py b/irsdb/metadata/models.py
index 96ca790..e7c32dd 100644
--- a/irsdb/metadata/models.py
+++ b/irsdb/metadata/models.py
@@ -1,4 +1,5 @@
 from django.db import models
+from django.contrib.postgres.fields import ArrayField
 
 # Base for import of metadata csv files
 class IRSxBase(models.Model):
@@ -18,10 +19,9 @@ class Variable(IRSxBase):
     db_type = models.CharField(max_length=63, blank=True, null=True, help_text="db type", editable=False)
     line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS line number. Missing in returnheader", editable=False)
     description = models.TextField(help_text="IRS-supplied description, from .xsd. ")
-    version_start = models.TextField(help_text="Start year", null=True)
-    version_end = models.TextField(help_text="End year", null=True)
     is_canonical = models.NullBooleanField(help_text="", default=False)
     canonical_version = models.CharField(max_length=16, blank=True, null=True, help_text="canonical_version", editable=False)
+    versions = ArrayField(models.CharField(max_length=10, blank=False))
 
     def get_absolute_url(self):
         return ("/metadata/variable/%s-%s.html" % (self.db_table, self.db_name))
@@ -31,8 +31,7 @@ class Group(IRSxBase):
     line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS-supplied line numbers. Missing for returnheaders", editable=False)
     description = models.TextField(help_text="IRS-supplied description, from .xsd. ")
     headless = models.NullBooleanField(help_text="", default=False)
-    version_start = models.TextField(help_text="Start year", null=True)
-    version_end = models.TextField(help_text="End year", null=True)
+    versions = ArrayField(models.CharField(max_length=10, blank=False))
 
     def get_absolute_url(self):
         return ("/metadata/groups/%s.html" % self.db_name)
@@ -50,12 +49,11 @@ def get_absolute_url(self):
 
 class LineNumber(models.Model):
     xpath = models.CharField(max_length=255, blank=True, null=True, help_text="xpath", editable=False)  #is this not equivalent to xpath?
-    version_start = models.TextField(help_text="Start year", null=True)
-    version_end = models.TextField(help_text="End year", null=True)
-    line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS-supplied line numbers. Missing for returnheaders", editable=False)
+    line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS-supplied line numbers. Missing for returnheaders", editable=False)
+    versions = ArrayField(models.CharField(max_length=10, blank=False))
+
 class Description(models.Model):
     xpath = models.CharField(max_length=255, blank=True, null=True, help_text="xpath", editable=False)  #is this not equivalent to xpath?
-    version_start = models.TextField(help_text="Start year", null=True)
-    version_end = models.TextField(help_text="End year", null=True)
     description = models.TextField(help_text="description")
+    versions = ArrayField(models.CharField(max_length=10, blank=False))
diff --git a/irsdb/metadata/views.py b/irsdb/metadata/views.py
index aaf3da7..1d1df24 100644
--- a/irsdb/metadata/views.py
+++ b/irsdb/metadata/views.py
@@ -21,7 +21,7 @@
 # The base of the file system
 try:
     FILE_SYSTEM_BASE = settings.FILE_SYSTEM_BASE
-except ImportError:
+except AttributeError:
     FILE_SYSTEM_BASE = ''
 # When set to true will 'cache' a baked version of the page
 # To run a full bake, run a 'scrape' of every page that needs update
@@ -228,4 +228,4 @@ def show_forms(request):
     }
     if BAKE_OUT:
         bake(request, template, context)
-    return render(request, template, context)
\ No newline at end of file
+    return render(request, template, context)
diff --git a/irsdb/return/management/commands/load_filings.py b/irsdb/return/management/commands/load_filings.py
index ca972c0..6511638 100644
--- a/irsdb/return/management/commands/load_filings.py
+++ b/irsdb/return/management/commands/load_filings.py
@@ -12,12 +12,12 @@
 from irsx.settings import INDEX_DIRECTORY
 from irsx.file_utils import stream_download
 from irsx.xmlrunner import XMLRunner
+from irsx.filing import InvalidXMLException
 
 # this is how many we process; there's a separate batch size
 # in model accumulator for how many are processed
 BATCH_SIZE = 1000
 
-
 class Command(BaseCommand):
     help = ''' Enter the filings, one by one. 
@@ -37,31 +37,38 @@ def setup(self):
 
     def process_sked(self, sked):
         """ Enter just one schedule """
-        #print("Processing schedule %s" % sked['schedule_name'])
+        #self.stdout.write("Processing schedule %s" % sked['schedule_name'])
         for part in sked['schedule_parts'].keys():
             partname = part
             partdata = sked['schedule_parts'][part]
-            #print("part %s %s" % (partname, partdata))
+            #self.stdout.write("part %s %s" % (partname, partdata))
             self.accumulator.add_model(partname, partdata)
 
         for groupname in sked['groups'].keys():
             for groupdata in sked['groups'][groupname]:
-                #print("group %s %s" % (groupname, groupdata) )
+                #self.stdout.write("group %s %s" % (groupname, groupdata) )
                 self.accumulator.add_model(groupname, groupdata)
 
     def run_filing(self, filing):
         object_id = filing.object_id
-        print("run_filing %s" % object_id)
-        parsed_filing = self.xml_runner.run_filing(object_id)
+        self.stdout.write("run_filing %s" % object_id)
+
+        # if we get a bad xml file, delete the file and retry once
+        try:
+            parsed_filing = self.xml_runner.run_filing(object_id)
+        except InvalidXMLException as e:
+            os.remove(e.filepath)
+            parsed_filing = self.xml_runner.run_filing(object_id)
+
         if not parsed_filing:
-            print("Skipping filing %s(filings with pre-2013 filings are skipped)\n row details: %s" % (filing, metadata_row))
+            self.stderr.write("Skipping filing %s (pre-2013 filings are skipped)\n row details: %s" % (filing, metadata_row))
             return None
 
         schedule_list = parsed_filing.list_schedules()
-        #print("sked list is %s" % schedule_list)
+        #self.stdout.write("sked list is %s" % schedule_list)
 
         result = parsed_filing.get_result()
@@ -75,9 +82,8 @@ def run_filing(self, filing):
 
         if keyerrors:
             # If we find keyerrors--xpaths that are missing from our spec, note it
-            print("Key error %s")
             has_keyerrors = len(keyerrors) > 0
-            print("keyerror: %s" % keyerrors)
+            self.stderr.write("keyerror: %s" % keyerrors)
             filing.error_details = str(keyerrors)
             filing.key_error_count = len(keyerrors)
             filing.is_error = has_keyerrors
@@ -87,7 +93,7 @@ def run_filing(self, filing):
             for sked in result:
                 self.process_sked(sked)
         else:
-            print("Filing not parsed %s " % object_id)
+            self.stderr.write("Filing not parsed %s " % object_id)
 
     def handle(self, *args, **options):
@@ -96,14 +102,14 @@ def handle(self, *args, **options):
 
         if year not in [2014, 2015, 2016, 2017, 2018, 2019]:
             raise RuntimeError("Illegal year `%s`. Please enter a year between 2014 and 2019" % year)
-        print("Running filings during year %s" % year)
+        self.stdout.write("Running filings during year %s" % year)
         self.setup()
         process_count = 0
         while True:
             filings=Filing.objects.filter(submission_year=year).exclude(parse_complete=True)[:100]
             if not filings:
-                print("Done")
+                self.stdout.write("Done")
                 break
 
             object_id_list = [f.object_id for f in filings]
@@ -112,14 +118,14 @@ def handle(self, *args, **options):
             Filing.objects.filter(object_id__in=object_id_list).update(parse_started=True)
 
             for filing in filings:
-                #print("Handling id %s" % filing.object_id)
+                #self.stdout.write("Handling id %s" % filing.object_id)
                 self.run_filing(filing)
                 process_count += 1
                 if process_count % 1000 == 0:
-                    print("Handled %s filings" % process_count)
+                    self.stdout.write("Handled %s filings" % process_count)
 
             # commit anything that's left
             self.accumulator.commit_all()
            # record that all are complete
             Filing.objects.filter(object_id__in=object_id_list).update(process_time=datetime.now(), parse_complete=True)
-        print("Processed a total of %s filings" % process_count)
+        self.stdout.write("Processed a total of %s filings" % process_count)
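A usage sketch for the new Makefile (not part of the patch). It assumes the tools the recipes call (GNU make, wget, perl, the aws CLI, and pv) are installed, and that irsdb/irsdb/local_settings.py points at a PostgreSQL database, since the new ArrayField columns come from django.contrib.postgres; 2016 below is just an example year.

    cd irsdb
    make initialize_db   # run the app migrations and load the IRSx metadata tables
    make filing_2016     # enter the 2016 submissions, sync that year's XML from S3, then load the filings
    make                 # or build the default target: filing_<year> for every year in YEARS

Each filing_<year> target chains the index step, the S3 sync, and `load_filings <year>`, so a single year can be rebuilt on its own; initialize_db leaves a stamp file behind and only runs once.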