adjustments for building DB and using newer version of irsx #24
base: master
Makefile (new file):

```make
export IRSX_CACHE_DIRECTORY=.

YEARS = 2014 2015 2016 2017 2018 2019

all : $(patsubst %,filing_%,$(YEARS))

initialize_db :
	python manage.py makemigrations metadata
	python manage.py migrate metadata
	python manage.py load_metadata
	python manage.py makemigrations filing
	python manage.py migrate filing
	python manage.py makemigrations return
	python manage.py migrate return
	touch $@

.PRECIOUS : $(patsubst %,CSV/index_%.csv,$(YEARS))
$(IRSX_CACHE_DIRECTORY)/CSV/index_%.csv : initialize_db
	python manage.py enter_yearly_submissions $*

irs-990-form/CSV/index_2014.csv : initialize_db
	wget -O $@ -N https://s3.amazonaws.com/irs-form-990/$(notdir $@)
	perl -i.bak -p -e 's/SILVERCREST ASSET ,AMAGEMENT/SILVERCREST ASSET MANAGEMENT/g' $@
	python manage.py enter_yearly_submissions 2014
```
Author: It's a lot faster to download the files with the aws-cli and then run 'load_filings' than to have 'load_filings' also be responsible for downloading. On the downside, this requires folks to have an AWS account.

```make
download_filings_% :
	aws s3 sync s3://irs-form-990 $(IRSX_CACHE_DIRECTORY)/XML --exclude "*" --include "$**.xml" --no-progress | pv -l > /dev/null
```
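For anyone who wants the same step from Python instead of the aws-cli, here is a rough sketch using boto3 (an assumption on my part; the Makefile itself shells out to the aws-cli). It lists the public irs-form-990 bucket and downloads the XML files whose object IDs begin with the requested year, skipping files already on disk. Function and parameter names are illustrative only.

```python
# Hypothetical boto3 equivalent of the download_filings_<year> target.
# Assumes boto3 is installed and AWS credentials are configured, just as
# the aws-cli invocation does.
import os
import boto3

def download_filings(year, dest="XML", bucket="irs-form-990"):
    s3 = boto3.client("s3")
    os.makedirs(dest, exist_ok=True)
    paginator = s3.get_paginator("list_objects_v2")
    # Object IDs start with the submission year, so a year prefix
    # mirrors the Makefile's --include "<year>*.xml" filter.
    for page in paginator.paginate(Bucket=bucket, Prefix=str(year)):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            if not key.endswith(".xml"):
                continue
            target = os.path.join(dest, key)
            if not os.path.exists(target):  # crude "sync": skip files already present
                s3.download_file(bucket, key, target)

# Example: download_filings(2016, dest="./XML")
```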
```make
filing_% : $(IRSX_CACHE_DIRECTORY)/CSV/index_%.csv download_filings_%
	python manage.py load_filings $*
```
Django settings:

```diff
@@ -56,21 +56,21 @@
 ROOT_URLCONF = 'irsdb.urls'

-# TEMPLATES = [
-#     {
-#         'BACKEND': 'django.template.backends.django.DjangoTemplates',
-#         'DIRS': [],
-#         'APP_DIRS': False,
-#         'OPTIONS': {
-#             'context_processors': [
-#                 'django.template.context_processors.debug',
-#                 'django.template.context_processors.request',
-#                 'django.contrib.auth.context_processors.auth',
-#                 'django.contrib.messages.context_processors.messages',
-#             ],
-#         },
-#     },
-# ]
+TEMPLATES = [
+    {
+        'BACKEND': 'django.template.backends.django.DjangoTemplates',
+        'DIRS': [],
+        'APP_DIRS': False,
+        'OPTIONS': {
+            'context_processors': [
+                'django.template.context_processors.debug',
+                'django.template.context_processors.request',
+                'django.contrib.auth.context_processors.auth',
+                'django.contrib.messages.context_processors.messages',
+            ],
+        },
+    },
+]

 WSGI_APPLICATION = 'irsdb.wsgi.application'
```

Author: This needed to be uncommented in order for any of the management commands to run.
load_metadata management command:

```diff
@@ -29,6 +29,7 @@ def reload_variables(self, *args, **options):
             # row['canonical_version'] = CANONICAL_VERSION
             #else:
             # row['is_canonical'] = False
+            row['versions'] = row['versions'].split(';')
             Variable.objects.create(**row)
         print("Total Variables %s" % i)
```

Author: This is for using the github version of irsx.
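To make the change concrete, a tiny illustrative sketch (the sample row value is assumed, not taken from the real CSVs): the metadata files in the github version of irsx carry a single semicolon-delimited versions column, and splitting it produces the Python list the new ArrayField column expects.

```python
# Illustrative only; the sample value is an assumption.
row = {"db_name": "ein", "versions": "2013v3.0;2013v4.0;2014v5.0"}
row["versions"] = row["versions"].split(";")
# row["versions"] is now ["2013v3.0", "2013v4.0", "2014v5.0"],
# which Django can store directly in an ArrayField column.
```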
```diff
@@ -39,6 +40,7 @@ def reload_groups(self, *args, **options):
         infile = open(infile, 'r')
         reader = csv.DictReader(infile)
         for i, row in enumerate(reader):
+            row['versions'] = row['versions'].split(';')
             try:
                 if row['headless'] == '':
                     row['headless'] = None
@@ -73,6 +75,7 @@ def reload_line_numbers(self, *args, **options):
         infile = open(infile, 'r')
         reader = csv.DictReader(infile)
         for i, row in enumerate(reader):
+            row['versions'] = row['versions'].split(';')
            if i%REPORT_COUNT == 0:
                print("Created %s rows" % i)
            LineNumber.objects.create(**row)
@@ -85,6 +88,7 @@ def reload_descriptions(self, *args, **options):
         infile = open(infile, 'r')
         reader = csv.DictReader(infile)
         for i, row in enumerate(reader):
+            row['versions'] = row['versions'].split(';')
            if i%REPORT_COUNT == 0:
                print("Created %s rows" % i)
            Description.objects.create(**row)
@@ -97,4 +101,4 @@ def handle(self, *args, **options):
         self.reload_groups()
         self.reload_schedule_parts()
         self.reload_line_numbers()
         self.reload_descriptions()
```
Metadata models:

```diff
@@ -1,4 +1,5 @@
 from django.db import models
+from django.contrib.postgres.fields import ArrayField
```

Author: I imagine you may not want to tie yourself to postgres. There is a portable version of jsonfield that could be used here.
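A hedged sketch of that portable alternative: from Django 3.1 on, models.JSONField works on any supported database backend, so the versions list could be stored as JSON rather than a Postgres ArrayField (earlier Django versions would need the third-party jsonfield package the comment alludes to). The model and field names below are illustrative, not part of this PR.

```python
# Illustrative alternative, not what this PR does.
from django.db import models

class ExampleVariable(models.Model):
    db_name = models.CharField(max_length=128)
    # A JSON list of schema versions, e.g. ["2013v3.0", "2014v5.0"],
    # portable across SQLite/MySQL/Postgres (Django 3.1+).
    versions = models.JSONField(default=list)
```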
```diff
 # Base for import of metadata csv files
 class IRSxBase(models.Model):

@@ -18,10 +19,9 @@ class Variable(IRSxBase):
     db_type = models.CharField(max_length=63, blank=True, null=True, help_text="db type", editable=False)
     line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS line number. Missing in returnheader", editable=False)
     description = models.TextField(help_text="IRS-supplied description, from .xsd. ")
-    version_start = models.TextField(help_text="Start year", null=True)
-    version_end = models.TextField(help_text="End year", null=True)
     is_canonical = models.NullBooleanField(help_text="", default=False)
     canonical_version = models.CharField(max_length=16, blank=True, null=True, help_text="canonical_version", editable=False)
+    versions = ArrayField(models.CharField(max_length=10, blank=False))

     def get_absolute_url(self):
         return ("/metadata/variable/%s-%s.html" % (self.db_table, self.db_name))

@@ -31,8 +31,7 @@ class Group(IRSxBase):
     line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS-supplied line numbers. Missing for returnheaders", editable=False)
     description = models.TextField(help_text="IRS-supplied description, from .xsd. ")
     headless = models.NullBooleanField(help_text="", default=False)
-    version_start = models.TextField(help_text="Start year", null=True)
-    version_end = models.TextField(help_text="End year", null=True)
+    versions = ArrayField(models.CharField(max_length=10, blank=False))

     def get_absolute_url(self):
         return ("/metadata/groups/%s.html" % self.db_name)

@@ -50,12 +49,11 @@ def get_absolute_url(self):
 class LineNumber(models.Model):
     xpath = models.CharField(max_length=255, blank=True, null=True, help_text="xpath", editable=False) #is this not equivalent to xpath?
-    version_start = models.TextField(help_text="Start year", null=True)
-    version_end = models.TextField(help_text="End year", null=True)
     line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS-supplied line numbers. Missing for returnheaders", editable=False)
+    versions = ArrayField(models.CharField(max_length=10, blank=False))


 class Description(models.Model):
     xpath = models.CharField(max_length=255, blank=True, null=True, help_text="xpath", editable=False) #is this not equivalent to xpath?
-    version_start = models.TextField(help_text="Start year", null=True)
-    version_end = models.TextField(help_text="End year", null=True)
     description = models.TextField(help_text="description")
+    versions = ArrayField(models.CharField(max_length=10, blank=False))
```
Views module:

```diff
@@ -21,7 +21,7 @@
 # The base of the file system
 try:
     FILE_SYSTEM_BASE = settings.FILE_SYSTEM_BASE
-except ImportError:
+except AttributeError:
     FILE_SYSTEM_BASE = ''
 # When set to true will 'cache' a baked version of the page
 # To run a full bake, run a 'scrape' of every page that needs update
```

Author: This is the correct exception catcher.
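For context, a minimal standalone illustration of why AttributeError is the right exception here (not code from this PR): Django's settings object raises AttributeError when an optional setting is absent, and the whole pattern can also be collapsed into getattr with a default.

```python
# Minimal illustration; assumes Django is installed and settings are configured.
from django.conf import settings

try:
    FILE_SYSTEM_BASE = settings.FILE_SYSTEM_BASE
except AttributeError:   # raised when the setting is not defined
    FILE_SYSTEM_BASE = ''

# Equivalent idiom with a default value:
FILE_SYSTEM_BASE = getattr(settings, 'FILE_SYSTEM_BASE', '')
```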
```diff
@@ -228,4 +228,4 @@ def show_forms(request):
     }
     if BAKE_OUT:
         bake(request, template, context)
     return render(request, template, context)
```
load_filings management command:

```diff
@@ -12,12 +12,12 @@
 from irsx.settings import INDEX_DIRECTORY
 from irsx.file_utils import stream_download
 from irsx.xmlrunner import XMLRunner
+from irsx.filing import InvalidXMLException

 # this is how many we process; there's a separate batch size
 # in model accumulator for how many are processed
 BATCH_SIZE = 1000


 class Command(BaseCommand):
     help = '''
     Enter the filings, one by one.

@@ -37,31 +37,38 @@ def setup(self):
     def process_sked(self, sked):
         """ Enter just one schedule """
-        #print("Processing schedule %s" % sked['schedule_name'])
+        #self.stdout.write("Processing schedule %s" % sked['schedule_name'])
         for part in sked['schedule_parts'].keys():
             partname = part
             partdata = sked['schedule_parts'][part]
-            #print("part %s %s" % (partname, partdata))
+            #self.stdout.write("part %s %s" % (partname, partdata))
             self.accumulator.add_model(partname, partdata)

         for groupname in sked['groups'].keys():
             for groupdata in sked['groups'][groupname]:
-                #print("group %s %s" % (groupname, groupdata) )
+                #self.stdout.write("group %s %s" % (groupname, groupdata) )
                 self.accumulator.add_model(groupname, groupdata)
```

Author: Just using the django command guidance here.
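As a reminder of the convention being followed, here is a minimal sketch of the Django management-command output guidance (illustrative, not part of this PR): commands write to self.stdout / self.stderr rather than calling print, so output can be captured, redirected, or silenced by callers such as call_command.

```python
# Minimal illustrative command; the command itself is hypothetical.
from django.core.management.base import BaseCommand

class Command(BaseCommand):
    help = "Example of the self.stdout / self.stderr convention"

    def handle(self, *args, **options):
        self.stdout.write("normal progress message")        # instead of print()
        self.stderr.write("something that deserves attention")
        self.stdout.write(self.style.SUCCESS("done"))        # optional styling
```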
```diff
     def run_filing(self, filing):
         object_id = filing.object_id
-        print("run_filing %s" % object_id)
-        parsed_filing = self.xml_runner.run_filing(object_id)
+        self.stdout.write("run_filing %s" % object_id)
+
+        # if we get a bad xml file, delete the file and retry once
+        try:
+            parsed_filing = self.xml_runner.run_filing(object_id)
+        except InvalidXMLException as e:
+            os.remove(e.filepath)
+            parsed_filing = self.xml_runner.run_filing(object_id)
```

Author: This was useful for me.
```diff
         if not parsed_filing:
-            print("Skipping filing %s(filings with pre-2013 filings are skipped)\n row details: %s" % (filing, metadata_row))
+            self.stderr.write("Skipping filing %s(filings with pre-2013 filings are skipped)\n row details: %s" % (filing, metadata_row))
             return None

         schedule_list = parsed_filing.list_schedules()
-        #print("sked list is %s" % schedule_list)
+        #self.stdout.write("sked list is %s" % schedule_list)

         result = parsed_filing.get_result()

@@ -75,9 +82,8 @@ def run_filing(self, filing):
         if keyerrors:
             # If we find keyerrors--xpaths that are missing from our spec, note it
-            print("Key error %s")
             has_keyerrors = len(keyerrors) > 0
-            print("keyerror: %s" % keyerrors)
+            self.stderr.write("keyerror: %s" % keyerrors)
             filing.error_details = str(keyerrors)
             filing.key_error_count = len(keyerrors)
             filing.is_error = has_keyerrors

@@ -87,7 +93,7 @@ def run_filing(self, filing):
             for sked in result:
                 self.process_sked(sked)
         else:
-            print("Filing not parsed %s " % object_id)
+            self.stderr.write("Filing not parsed %s " % object_id)


     def handle(self, *args, **options):

@@ -96,14 +102,14 @@ def handle(self, *args, **options):
         if year not in [2014, 2015, 2016, 2017, 2018, 2019]:
             raise RuntimeError("Illegal year `%s`. Please enter a year between 2014 and 2019" % year)

-        print("Running filings during year %s" % year)
+        self.stdout.write("Running filings during year %s" % year)
         self.setup()

         process_count = 0
         while True:
             filings=Filing.objects.filter(submission_year=year).exclude(parse_complete=True)[:100]
             if not filings:
-                print("Done")
+                self.stdout.write("Done")
                 break

             object_id_list = [f.object_id for f in filings]

@@ -112,14 +118,14 @@ def handle(self, *args, **options):
             Filing.objects.filter(object_id__in=object_id_list).update(parse_started=True)

             for filing in filings:
-                #print("Handling id %s" % filing.object_id)
+                #self.stdout.write("Handling id %s" % filing.object_id)
                 self.run_filing(filing)
                 process_count += 1
                 if process_count % 1000 == 0:
-                    print("Handled %s filings" % process_count)
+                    self.stdout.write("Handled %s filings" % process_count)

             # commit anything that's left
             self.accumulator.commit_all()
             # record that all are complete
             Filing.objects.filter(object_id__in=object_id_list).update(process_time=datetime.now(), parse_complete=True)
-            print("Processed a total of %s filings" % process_count)
+            self.stdout.write("Processed a total of %s filings" % process_count)
```
Author: This is a Makefile I made for building the whole db.