Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import urllib
import urllib.parse
import urllib.request

import bs4
# Root of the EDGAR tree on the SEC FTP server; filing paths are built from it.
# NOTE(review): confirm this FTP endpoint is still reachable -- EDGAR may only
# be served over HTTPS now.
secFtpBase = 'ftp://ftp.sec.gov/edgar/'
# Login used for the FTP session.
secUserName = 'anonymous'
# NOTE(review): a personal e-mail address is hard-coded here; consider reading
# it from configuration or the environment instead of committing it.
secPassWord = 'tmo8145@gmail.com'
def downloadSecDocument(cik, accessionNum):
    """Download one SEC filing over FTP and return its raw, uncleaned text.

    Args:
        cik: Central Index Key identifying the company; used verbatim as the
            directory name under ``data/``.
        accessionNum: Accession number unique to the filing document. Dashes
            are optional and are stripped before the path is built.

    Returns:
        The complete filing as a ``str`` (undecodable bytes are replaced).

    Raises:
        urllib.error.URLError: If the server cannot be reached or the file
            cannot be retrieved.
    """
    cik = str(cik)
    accessionNum = str(accessionNum).replace('-', '')

    # The directory uses the bare 18-digit accession number, while the file
    # itself uses the dashed form: 10-digit filer id, 2-digit year, sequence.
    dashedAccession = '-'.join(
        (accessionNum[:10], accessionNum[10:12], accessionNum[12:]))
    documentFtpLink = (secFtpBase + 'data/' + cik + '/' + accessionNum + '/'
                       + dashedAccession + '.txt')

    # FTP credentials travel in the URL's user-info part. They are
    # percent-quoted because the password contains '@', which would otherwise
    # be read as the end of the user-info section.
    linkParts = urllib.parse.urlsplit(documentFtpLink)
    credentials = (urllib.parse.quote(secUserName, safe='') + ':'
                   + urllib.parse.quote(secPassWord, safe=''))
    authedLink = urllib.parse.urlunsplit(
        linkParts._replace(netloc=credentials + '@' + linkParts.netloc))

    with urllib.request.urlopen(authedLink) as response:
        return response.read().decode('utf-8', errors='replace')
def getBasicFileData(fileStr):
    """Return the basic header fields found at the top of a filing.

    Only the header that precedes the HTML document is scanned; reading stops
    at the first ``</SEC-HEADER>`` line. When a label occurs more than once,
    the first occurrence wins.

    NOTE(review): the label texts matched below assume the usual EDGAR
    SEC-HEADER layout -- verify them against a real filing.

    Args:
        fileStr: The filing as a ``str`` or ``bytes``, or any object that
            exposes ``iter_lines()`` (lines may be ``str`` or ``bytes``).

    Returns:
        A dict with the keys ``acceptanceDateTime``, ``accessionNumber``,
        ``submissionType``, ``companyName``, ``cik``, ``reportPeriod`` and
        ``fileDate``. A field that is absent from the header is ``None``.
    """
    acceptanceTag = '<ACCEPTANCE-DATETIME>'
    labelToKey = {
        'ACCESSION NUMBER': 'accessionNumber',
        'CONFORMED SUBMISSION TYPE': 'submissionType',
        'COMPANY CONFORMED NAME': 'companyName',
        'CENTRAL INDEX KEY': 'cik',
        'CONFORMED PERIOD OF REPORT': 'reportPeriod',
        'FILED AS OF DATE': 'fileDate',
    }
    basicData = {'acceptanceDateTime': None}
    for key in labelToKey.values():
        basicData[key] = None

    if fileStr is None:
        fileLines = []
    elif hasattr(fileStr, 'iter_lines'):
        fileLines = fileStr.iter_lines()
    else:
        if isinstance(fileStr, bytes):
            fileStr = fileStr.decode('utf-8', errors='replace')
        fileLines = fileStr.splitlines()

    for fileLine in fileLines:
        if isinstance(fileLine, bytes):
            fileLine = fileLine.decode('utf-8', errors='replace')
        fileLine = fileLine.strip()

        if fileLine.startswith('</SEC-HEADER>'):
            break

        # The acceptance timestamp is the only field written as an SGML tag
        # rather than as a "LABEL: value" pair.
        if fileLine.startswith(acceptanceTag):
            if basicData['acceptanceDateTime'] is None:
                basicData['acceptanceDateTime'] = (
                    fileLine[len(acceptanceTag):].strip())
            continue

        # Split on the first colon instead of slicing at fixed offsets, so
        # the number of tabs after the label does not matter.
        label, separator, value = fileLine.partition(':')
        key = labelToKey.get(label.strip())
        if separator and key is not None and basicData[key] is None:
            basicData[key] = value.strip()

    return basicData
def parsefile(fileStr):
    """Parse a 10-K/10-Q filing into a Beautiful Soup document.

    Args:
        fileStr: Uncleaned HTML/CSS markup of the filing, as a str.

    Returns:
        The parsed document as a ``bs4.BeautifulSoup`` object.
    """
    # Only the ``bs4`` module is imported in this file, so the class has to be
    # reached through it. The parser is named explicitly so the resulting tree
    # does not depend on which optional parsers are installed.
    fileDom = bs4.BeautifulSoup(fileStr, 'html.parser')
    return fileDom
def parseTables(fileDom):
    """Extract every table from a Beautiful Soup object as a dictionary.

    Args:
        fileDom: Parsed document; must provide ``find_all('table')``.

    Returns:
        A dict mapping each table's zero-based position in document order to
        the value ``parseTable`` returns for it. Empty when the document has
        no tables.
    """
    tableObjs = fileDom.find_all('table')
    return {
        tableIndex: parseTable(tableObj)
        for tableIndex, tableObj in enumerate(tableObjs)
    }
def parseTable(tableObj):
    """Convert an HTML table into a list of lists, one inner list per row.

    Args:
        tableObj: Table element; must provide ``find_all('tr')``.

    Returns:
        A list with one entry per ``<tr>``. Each entry is the list of that
        row's cell texts, stripped of surrounding whitespace, in document
        order. A row without cells yields an empty list.
    """
    tableRows = tableObj.find_all('tr')
    parsedRows = []
    for tableRow in tableRows:
        # A row may mix <th> row headers with <td> data cells. Asking for
        # both tag names in one call keeps the cells in column order, which
        # two separate lookups would lose.
        rowCells = tableRow.find_all(['th', 'td'])
        parsedRows.append(
            [rowCell.get_text(strip=True) for rowCell in rowCells])
    return parsedRows
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement