Skip to content

Commit

Permalink
Update Filing object, bump version. Add PyPi push action.
Browse files Browse the repository at this point in the history
  • Loading branch information
gaulinmp committed Dec 16, 2024
1 parent 9d84406 commit 9b1e63e
Show file tree
Hide file tree
Showing 10 changed files with 192 additions and 38 deletions.
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# pyedgar

Python package for downloading EDGAR documents and data
Python package for downloading EDGAR documents and data.

[![PyPI version shields.io](https://img.shields.io/pypi/v/pyedgar.svg)](https://pypi.python.org/pypi/pyedgar/)
[![PyPI license](https://img.shields.io/pypi/l/pyedgar.svg)](https://pypi.python.org/pypi/pyedgar/)
[![PyPI pyversions](https://img.shields.io/pypi/pyversions/pyedgar.svg)](https://pypi.python.org/pypi/pyedgar/)
[![GitHub latest commit](https://badgen.net/github/last-commit/gaulinmp/pyedgar)](https://GitHub.com/gaulinmp/pyedgar/commit/)



## Usage
Expand Down
2 changes: 1 addition & 1 deletion pyedgar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"""

__title__ = 'pyedgar'
__version__ = '0.1.8'
__version__ = '0.1.9'
__version_info__ = tuple(int(i) for i in __version__.split("."))
__author__ = 'Mac Gaulin'
__license__ = 'MIT'
Expand Down
4 changes: 2 additions & 2 deletions pyedgar/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
KEEP_REGEX=
; User Agent for downloading, to keep the SEC happy
USER_AGENT=pyedgar feed download by YOUREMAIL@example.com, from code at https://github.com/gaulinmp/pyedgar
USER_AGENT=pyedgar feed download by YOUREMAIL@sec.gov, from code at https://github.com/gaulinmp/pyedgar
[Index]
; Index file settings
Expand Down Expand Up @@ -196,7 +196,7 @@ def get_config_file(extra_dirs=None):
"KEEP_REGEX": "",
"INDEX_DELIMITER": "\t",
"INDEX_EXTENSION": "tab",
"USER_AGENT": "pyedgar feed download by YOUREMAIL@example.com, from code at https://github.com/gaulinmp/pyedgar",
"USER_AGENT": "pyedgar feed download by YOUREMAIL@sec.gov, from code at https://github.com/gaulinmp/pyedgar",
}

CONFIG_FILE = get_config_file()
Expand Down
1 change: 1 addition & 0 deletions pyedgar/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def main(start_date=None, last_n_days=30, get_indices=False, get_feeds=False, us


def print_cache_status():
"""Prints out the last found cache files for feeds and indices."""
for i_date in reversed(list(utilities.iterate_dates(1995))):
_feedfile = config.get_feed_cache_path(i_date)
if os.path.exists(_feedfile):
Expand Down
165 changes: 145 additions & 20 deletions pyedgar/filing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,71 @@
"""
Base class for EDGAR filing.
Meant to be easily overridden. For example, to create a filing class that allows for easy extraction of
BeautifulSoup documents and local caching (e.g. if reading from edgar website):
```python
import os
from bs4 import BeautifulSoup
from IPython.display import display_html
import pyedgar
from pyedgar.utilities import htmlparse
class Filing(pyedgar.Filing):
# caches filings in a directory next to this file called data/cache
DATA_ROOT = "data"
@property
def cache_path(self):
return os.path.join(self.DATA_ROOT, 'cache', self.accession.split('-')[1], f"{self.accession}.txt")
def _cache_local(self, doc_to_cache):
try:
with open(self.cache_path, 'w') as fh:
fh.write(doc_to_cache)
except FileNotFoundError:
# ## directory doesn't exist, make it and retry. If the parent doesn't exist either
# (the cache folder) then it'll error out below, so no infinite recursion.
os.mkdir(os.path.dirname(self.cache_path))
return self._cache_local(doc_to_cache)
except Exception:
# I guess caching didn't work...
return doc_to_cache
def _post_init_hook(self, **kwargs):
self._local_cache = True
if os.path.exists(self.cache_path):
self._filing_local_path = self.cache_path
for k,v in kwargs.items():
if k not in self.__dict__:
setattr(self, k, v)
def _set_full_text(self):
_txt = super()._set_full_text()
try:
if not os.path.exists(self.cache_path):
self._cache_local(_txt)
except Exception:
pass
return _txt
def is_html(self, docnum=0, *args, **kwargs):
return htmlparse.is_html(self.documents[docnum]['full_text'], *args, **kwargs)
def soup(self, docnum=0, *args, **kwargs):
return BeautifulSoup(self.documents[docnum]['full_text'], *args, **kwargs)
def print(self, docnum=0):
'''jupyter notebook aware print function'''
if self.is_html(docnum):
display_html(self.documents[docnum]['full_text'], raw=True)
else:
print(self.documents[docnum]['full_text'])
```
:copyright: © 2021 by Mac Gaulin
:license: MIT, see LICENSE for more details.
"""
Expand All @@ -10,11 +75,14 @@
import re
import logging

try:
from bs4 import BeautifulSoup
except ImportError:
pass

from pyedgar import config
from pyedgar.utilities import edgarweb
from pyedgar.utilities import forms
from pyedgar.utilities import get_cik_acc, edgarweb, forms, localstore, htmlparse
from pyedgar.utilities.forms import FORMS
from pyedgar.utilities import localstore


class Filing(object):
Expand Down Expand Up @@ -61,7 +129,8 @@ def __init__(
flat_headers=True,
omit_duplicate_headers=False,
duplicate_headers_as_list=True,
**kwargs
read_kwargs=None,
**kwargs,
):
"""
Initialization sets CIK, Accession, and optionally
Expand All @@ -86,6 +155,7 @@ def __init__(
if True will return the header values as a list (e.g. ['5.02',
'5.07', '9.01']). If False will add _# to duplicate header names.
Default: True.
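read_kwargs (dict, None): Keyword arguments forwarded to `forms.get_full_filing` when the filing text is read from disk (e.g. `encoding`, `errors`). Defaults to None, which is treated as an empty dict.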
Returns:
Filing object.
Expand All @@ -95,27 +165,55 @@ def __init__(
"""
self.__log = logging.getLogger("pyedgar.filing.Filing")

if accession is None:
try:
_ac = get_cik_acc(cik)
cik, accession = _ac['cik'], _ac['accession']
except TypeError as exc:
raise ValueError(f"CIK/Accession input not formatted as expected. Got: {cik}/{accession}") from exc

self._set_cik(cik)
self._set_accession(accession)

self._local_cache = use_cache if use_cache is not None else config.CACHE_FEED
self._web_fallback = web_fallback

self.read_args = kwargs
self.read_args = read_kwargs or {}
self.header_args = {
"flat": flat_headers,
"omit_duplicates": omit_duplicate_headers,
"add_int_to_name": not duplicate_headers_as_list,
}

self._post_init_hook(**kwargs)

def _post_init_hook(self, **kwargs):
    """
    Post-init hook, called as the last step of `__init__`.

    Receives every extra keyword argument that was passed to `__init__`.
    Does nothing by default (it only returns `self`); override it in a
    subclass to do things like store extra data or set up local caching.
    Example:

    ```python
    class gvkeyFiling(pyedgar.filing.Filing):
        def _post_init_hook(self, **kwargs):
            self.gvkey = kwargs.get('gvkey', -1)
    ```

    Args:
        **kwargs: Extra keyword arguments given to `__init__`.

    Returns:
        This object. The call in `__init__` does not use the return value.
    """
    return self

def __repr__(self):
return "<EDGAR filing ({}/{}) Headers:{}, Text:{}, Documents:{}>".format(
self.cik, self.accession, bool(self._headers), bool(self._full_text), bool(self._documents),
return (
f"<EDGAR filing ({self.cik}/{self.accession}) Loaded Headers:{bool(self._headers)}, "
f"Text:{bool(self._full_text)}, Documents:{bool(self._documents)}>"
)

def __str__(self):
    """Return the same text as `__repr__`."""
    return self.__repr__()

#===================================================================================================================
# Helper functions
#===================================================================================================================
def _set_cik(self, cik=None):
"""
Set cik on object, verifying format is CIK-like.
Expand All @@ -132,9 +230,9 @@ def _set_cik(self, cik=None):
try:
if cik is not None:
self._cik = int(cik)
except ValueError:
except ValueError as exc:
# They didn't pass in a CIK that looked like a number
raise ValueError("CIKs must be numeric variables," " you passed in {}".format(cik))
raise ValueError(f"CIKs must be numeric variables, you passed in {cik}") from exc

return self._cik

Expand All @@ -157,15 +255,14 @@ def _set_accession(self, accession=None):
try:
if accession and localstore.ACCESSION_RE.search(accession):
if len(accession) == 18:
self._accession = "{}-{}-{}".format(accession[:10], accession[10:12], accession[12:])
self._accession = f"{accession[:10]}-{accession[10:12]}-{accession[12:]}"
else:
self._accession = accession
except TypeError:
except TypeError as exc:
# They didn't pass in an accession that was a string.
raise ValueError(
"Accessions must be 18/20 character strings of format"
" ##########-##-######, you passed in {}".format(accession)
)
f"Accessions must be 18/20 character strings of format ##########-##-######, you passed in {accession}"
) from exc

return self.accession

Expand All @@ -190,14 +287,15 @@ def _set_full_text(self):
try:
self._full_text = forms.get_full_filing(self.path, **self.read_args)
return self._full_text
except FileNotFoundError:
msg = "Filing not found for CIK:{} / Accession:{}".format(self.cik, self.accession)
except FileNotFoundError as exc:
msg = f"Filing not found for CIK:{self.cik} / Accession:{self.accession}"
self.__log.debug(msg)
if not self._web_fallback:
raise FileNotFoundError(msg)
raise FileNotFoundError(msg) from exc

self.__log.debug("Downloading from EDGAR web: %d/%s", self.cik, self.accession)
self._full_text = edgarweb.download_form_from_web(self.cik, self.accession)
if self._web_fallback:
self.__log.debug("Downloading from EDGAR web: %d/%s", self.cik, self.accession)
self._full_text = edgarweb.download_form_from_web(self.cik, self.accession)

return self._full_text

Expand Down Expand Up @@ -273,6 +371,9 @@ def _set_documents(self):

return self._documents

#===================================================================================================================
# Properties
#===================================================================================================================
cik = property(fget=lambda self: self._cik, fset=_set_cik)
accession = property(fget=lambda self: self._accession, fset=_set_accession)

Expand Down Expand Up @@ -347,7 +448,8 @@ def type_exact(self):
Returns:
str: Full type string of the document from the header.
"""
return self._type or self._set_type()
self.type
return self._type_exact

@property
def documents(self):
Expand All @@ -361,6 +463,10 @@ def documents(self):
"""
return self._documents or self._set_documents()


#===================================================================================================================
# Method functions
#===================================================================================================================
def get_sequence_number(self, sequence_number):
"""
Access exhibits (or main filing) by sequence number (1-indexed).
Expand Down Expand Up @@ -429,3 +535,22 @@ def get_documents_by_tag(self, tag_name, search_string, regex=False, flags=0):
ret.append(doc)

return ret


class HTMLFiling(Filing):
    """
    Filing with convenience functions for dealing with HTML documents. Adds the methods:

    * `is_html()`: Boolean flag for whether specified document (`docnum=X`) is HTML format
    * `soup()`: Returns a BeautifulSoup object of specified document (note: does not check is_html)
    """

    def is_html(self, docnum=0, **kwargs):
        """
        Check whether the document at `docnum` is HTML.

        Args:
            docnum (int): Index into `self.documents`. Default: 0.
            **kwargs: Passed along to `htmlparse.is_html`.

        Returns:
            Whatever `htmlparse.is_html` returns for the document's full text.
        """
        return htmlparse.is_html(self.documents[docnum]['full_text'], **kwargs)

    def soup(self, *args, docnum=0, **kwargs):
        """
        Returns the document at `docnum` as a BeautifulSoup object.

        Args/Kwargs are passed along to BeautifulSoup, so docnum must be specified as a keyword argument,
        e.g.: `filing.soup('lxml', docnum=1)`

        Args:
            *args: Positional arguments passed along to BeautifulSoup.
            docnum (int): Index into `self.documents`. Keyword-only. Default: 0.
            **kwargs: Keyword arguments passed along to BeautifulSoup.

        Returns:
            BeautifulSoup object built from the document's full text.

        Raises:
            ImportError: If bs4 is not installed.
        """
        # The module-level bs4 import is optional (ImportError is swallowed there), so the
        # name may be undefined here. Turn that NameError into an actionable ImportError.
        try:
            soup_factory = BeautifulSoup
        except NameError as exc:
            raise ImportError(
                "HTMLFiling.soup() requires BeautifulSoup, which could not be imported. "
                "Install it with `pip install beautifulsoup4`."
            ) from exc

        return soup_factory(self.documents[docnum]['full_text'], *args, **kwargs)
2 changes: 1 addition & 1 deletion pyedgar/pyedgar.conf
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ KEEP_ALL=False
KEEP_REGEX=10-[KQ]|10[KQ]SB|8-K|(?:14A$)|13[FDG]

; User Agent for downloading, to keep the SEC happy
USER_AGENT=pyedgar feed download by YOUREMAIL@example.com, from code at https://github.com/gaulinmp/pyedgar
USER_AGENT=pyedgar feed download by YOUREMAIL@sec.gov, from code at https://github.com/gaulinmp/pyedgar

[Index]
; Index file settings
Expand Down
4 changes: 2 additions & 2 deletions pyedgar/utilities/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _get_thing(thing, box_o_things):
return None

if len(args):
# Here, look for cik & acc, put the rest into
# Here, look for cik & acc, put the rest into the args/kwargs key
for arg in args:
if arg is None:
continue
Expand Down Expand Up @@ -110,7 +110,7 @@ def _get_thing(thing, box_o_things):

if len(kwargs):
try:
# int of float because pandas might convert cik to float
# int of float of input, because pandas might convert cik to float
_ret["cik"] = int(float(kwargs["cik"]))
except (ValueError, TypeError, KeyError):
# Wasn't the right type/parseable, or no cik key
Expand Down
2 changes: 1 addition & 1 deletion pyedgar/utilities/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def get_full_filing(file_path, encoding=None, errors="ignore"):
FileNotFoundError: Raised if file doesn't exist.
"""
if not os.path.exists(file_path):
raise FileNotFoundError("File {} does not exist.".format(file_path))
raise FileNotFoundError(f"File {file_path} does not exist.")

with open(file_path, encoding=encoding or ENCODING_INPUT, errors=errors or "ignore") as fh:
return fh.read()
Expand Down
Loading

0 comments on commit 9b1e63e

Please sign in to comment.