Logo

Evan Trock

LinkedIn
Resume
GitHub

import fnmatch
import glob
import os
import re
from time import sleep
from zipfile import ZipFile

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from utils.near_regex import * 
from tqdm import tqdm  
from sec_edgar_downloader import Downloader
from requests_html import HTMLSession
import shutil

os.makedirs("output", exist_ok=True)
#load in the s&p 500 data csv
sp500_file = 'inputs/sp500_2022.csv'

# get it if we haven't 
if not os.path.exists(sp500_file):
    # 2022 dec version of page
    url = 'https://en.wikipedia.org/w/index.php?title=List_of_S%26P_500_companies&oldid=1130173030'
    pd.read_html(url)[0].to_csv(sp500_file,index=False)

# load and look at it
sp500 = pd.read_csv(sp500_file)    
dl = Downloader("Evan Trock", 
                "ert226@lehigh.edu",
                "10k_files")
if not os.path.exists('10k_files/10k_files.zip'):
    
    for cik in tqdm(sp500['CIK'][:503]):
         
        firm_folder = f'10k_files/sec-edgar-filings/{str(cik).zfill(10)}/'  # str(cik).zfill(10)   means that CIK 1234 becomes 0000001234

        # if I haven't downloaded any HTML for this firm (len=0 files on this pattern), do so
        # you could make this more precise and only look for filings during 2022 (if you were downloading across many years)
        
        if len(glob.glob(firm_folder + '/10-K/*/*.html')) == 0:
            
            dl.get("10-K", cik, 
                   limit=1,                  # get the latest filing within window
                   after="2022-01-01",       # does this download filings ON 1/1 or nah? (check)
                   before="2022-12-31",      # does this download filings ON 12/31 or nah? (check)
                   download_details =True    # download the html 
            ) 
    
        # delete the txt files as we go!!!
        # files are of the form: folder/10-K/*/*.txt
        for txt_f in glob.glob(firm_folder + '/10-K/*/*.txt'):
            os.remove(txt_f) 
100%|██████████| 503/503 [05:53<00:00,  1.42it/s]
files = glob.glob('10k_files/sec-edgar-filings/*/10-K/*/*.html')
files = [file.replace('\\', '/') for file in files]
#f'We have {len(files)} HTML files for {len(sp500["CIK"])} firms'
files
['10k_files/sec-edgar-filings/0000001800/10-K/0001104659-22-025141/primary-document.html',
 '10k_files/sec-edgar-filings/0000002488/10-K/0000002488-22-000016/primary-document.html',
 '10k_files/sec-edgar-filings/0000002969/10-K/0000002969-22-000054/primary-document.html',
 '10k_files/sec-edgar-filings/0000004127/10-K/0000004127-22-000038/primary-document.html',
 '10k_files/sec-edgar-filings/0000004281/10-K/0000004281-22-000004/primary-document.html',
 '10k_files/sec-edgar-filings/0000004447/10-K/0001628280-22-004524/primary-document.html',
 '10k_files/sec-edgar-filings/0000004904/10-K/0000004904-22-000024/primary-document.html',
 '10k_files/sec-edgar-filings/0000004962/10-K/0000004962-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0000004977/10-K/0000004977-22-000058/primary-document.html',
 '10k_files/sec-edgar-filings/0000005272/10-K/0001104659-22-024701/primary-document.html',
 '10k_files/sec-edgar-filings/0000006201/10-K/0000006201-22-000026/primary-document.html',
 '10k_files/sec-edgar-filings/0000006281/10-K/0000006281-22-000250/primary-document.html',
 '10k_files/sec-edgar-filings/0000006951/10-K/0000006951-22-000043/primary-document.html',
 '10k_files/sec-edgar-filings/0000007084/10-K/0000007084-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0000008670/10-K/0000008670-22-000038/primary-document.html',
 '10k_files/sec-edgar-filings/0000008818/10-K/0001193125-22-049910/primary-document.html',
 '10k_files/sec-edgar-filings/0000009389/10-K/0001558370-22-001251/primary-document.html',
 '10k_files/sec-edgar-filings/0000010456/10-K/0001628280-22-003432/primary-document.html',
 '10k_files/sec-edgar-filings/0000010795/10-K/0001628280-22-030686/primary-document.html',
 '10k_files/sec-edgar-filings/0000011544/10-K/0000011544-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000012208/10-K/0000012208-22-000011/primary-document.html',
 '10k_files/sec-edgar-filings/0000012927/10-K/0000012927-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0000014272/10-K/0000014272-22-000051/primary-document.html',
 '10k_files/sec-edgar-filings/0000014693/10-K/0000014693-22-000069/primary-document.html',
 '10k_files/sec-edgar-filings/0000016732/10-K/0000016732-22-000093/primary-document.html',
 '10k_files/sec-edgar-filings/0000016918/10-K/0000016918-22-000069/primary-document.html',
 '10k_files/sec-edgar-filings/0000018230/10-K/0000018230-22-000050/primary-document.html',
 '10k_files/sec-edgar-filings/0000018926/10-K/0000018926-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000019617/10-K/0000019617-22-000272/primary-document.html',
 '10k_files/sec-edgar-filings/0000020286/10-K/0000020286-22-000012/primary-document.html',
 '10k_files/sec-edgar-filings/0000021076/10-K/0000021076-22-000026/primary-document.html',
 '10k_files/sec-edgar-filings/0000021344/10-K/0000021344-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0000021665/10-K/0000021665-22-000003/primary-document.html',
 '10k_files/sec-edgar-filings/0000023217/10-K/0001437749-22-017530/primary-document.html',
 '10k_files/sec-edgar-filings/0000024545/10-K/0000024545-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0000024741/10-K/0001437749-22-003247/primary-document.html',
 '10k_files/sec-edgar-filings/0000026172/10-K/0000026172-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0000027419/10-K/0000027419-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000027904/10-K/0000027904-22-000003/primary-document.html',
 '10k_files/sec-edgar-filings/0000028412/10-K/0000028412-22-000067/primary-document.html',
 '10k_files/sec-edgar-filings/0000029534/10-K/0001558370-22-003921/primary-document.html',
 '10k_files/sec-edgar-filings/0000029905/10-K/0000029905-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0000029989/10-K/0000029989-22-000012/primary-document.html',
 '10k_files/sec-edgar-filings/0000031462/10-K/0001558370-22-002059/primary-document.html',
 '10k_files/sec-edgar-filings/0000031791/10-K/0000031791-22-000003/primary-document.html',
 '10k_files/sec-edgar-filings/0000032604/10-K/0000032604-22-000041/primary-document.html',
 '10k_files/sec-edgar-filings/0000033185/10-K/0000033185-22-000014/primary-document.html',
 '10k_files/sec-edgar-filings/0000033213/10-K/0000033213-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000034088/10-K/0000034088-22-000011/primary-document.html',
 '10k_files/sec-edgar-filings/0000034903/10-K/0000034903-22-000023/primary-document.html',
 '10k_files/sec-edgar-filings/0000035527/10-K/0000035527-22-000119/primary-document.html',
 '10k_files/sec-edgar-filings/0000036104/10-K/0001193125-22-048709/primary-document.html',
 '10k_files/sec-edgar-filings/0000036270/10-K/0001564590-22-005400/primary-document.html',
 '10k_files/sec-edgar-filings/0000037785/10-K/0000037785-22-000025/primary-document.html',
 '10k_files/sec-edgar-filings/0000037996/10-K/0000037996-22-000013/primary-document.html',
 '10k_files/sec-edgar-filings/0000038777/10-K/0000038777-22-000198/primary-document.html',
 '10k_files/sec-edgar-filings/0000040533/10-K/0000040533-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000040545/10-K/0000040545-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0000040704/10-K/0001193125-22-185257/primary-document.html',
 '10k_files/sec-edgar-filings/0000040987/10-K/0000040987-22-000013/primary-document.html',
 '10k_files/sec-edgar-filings/0000045012/10-K/0000045012-22-000013/primary-document.html',
 '10k_files/sec-edgar-filings/0000046080/10-K/0000046080-22-000023/primary-document.html',
 '10k_files/sec-edgar-filings/0000047111/10-K/0000047111-22-000017/primary-document.html',
 '10k_files/sec-edgar-filings/0000047217/10-K/0000047217-22-000068/primary-document.html',
 '10k_files/sec-edgar-filings/0000048465/10-K/0000048465-22-000051/primary-document.html',
 '10k_files/sec-edgar-filings/0000049071/10-K/0000049071-22-000017/primary-document.html',
 '10k_files/sec-edgar-filings/0000049196/10-K/0000049196-22-000023/primary-document.html',
 '10k_files/sec-edgar-filings/0000049826/10-K/0000049826-22-000006/primary-document.html',
 '10k_files/sec-edgar-filings/0000050863/10-K/0000050863-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000051143/10-K/0001558370-22-001584/primary-document.html',
 '10k_files/sec-edgar-filings/0000051253/10-K/0000051253-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000051434/10-K/0000051434-22-000016/primary-document.html',
 '10k_files/sec-edgar-filings/0000051644/10-K/0000051644-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0000052988/10-K/0000052988-22-000111/primary-document.html',
 '10k_files/sec-edgar-filings/0000055067/10-K/0001628280-22-003345/primary-document.html',
 '10k_files/sec-edgar-filings/0000055785/10-K/0000055785-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0000056873/10-K/0001558370-22-004595/primary-document.html',
 '10k_files/sec-edgar-filings/0000059478/10-K/0000059478-22-000068/primary-document.html',
 '10k_files/sec-edgar-filings/0000059558/10-K/0000059558-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0000060086/10-K/0000060086-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000060667/10-K/0000060667-22-000038/primary-document.html',
 '10k_files/sec-edgar-filings/0000062709/10-K/0000062709-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0000062996/10-K/0000062996-22-000011/primary-document.html',
 '10k_files/sec-edgar-filings/0000063754/10-K/0000063754-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0000063908/10-K/0000063908-22-000011/primary-document.html',
 '10k_files/sec-edgar-filings/0000064040/10-K/0000064040-22-000055/primary-document.html',
 '10k_files/sec-edgar-filings/0000064803/10-K/0000064803-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0000065984/10-K/0000065984-22-000017/primary-document.html',
 '10k_files/sec-edgar-filings/0000066740/10-K/0000066740-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0000068505/10-K/0000068505-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0000070858/10-K/0000070858-22-000062/primary-document.html',
 '10k_files/sec-edgar-filings/0000072331/10-K/0000072331-22-000185/primary-document.html',
 '10k_files/sec-edgar-filings/0000072741/10-K/0000072741-22-000015/primary-document.html',
 '10k_files/sec-edgar-filings/0000072903/10-K/0000072903-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0000072971/10-K/0000072971-22-000096/primary-document.html',
 '10k_files/sec-edgar-filings/0000073124/10-K/0000073124-22-000071/primary-document.html',
 '10k_files/sec-edgar-filings/0000073309/10-K/0001564590-22-007679/primary-document.html',
 '10k_files/sec-edgar-filings/0000074208/10-K/0000074208-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0000075362/10-K/0001564590-22-006237/primary-document.html',
 '10k_files/sec-edgar-filings/0000075677/10-K/0000950170-22-001913/primary-document.html',
 '10k_files/sec-edgar-filings/0000076334/10-K/0000076334-22-000034/primary-document.html',
 '10k_files/sec-edgar-filings/0000077360/10-K/0000077360-22-000006/primary-document.html',
 '10k_files/sec-edgar-filings/0000077476/10-K/0000077476-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0000078003/10-K/0000078003-22-000027/primary-document.html',
 '10k_files/sec-edgar-filings/0000079282/10-K/0000950170-22-001654/primary-document.html',
 '10k_files/sec-edgar-filings/0000079879/10-K/0000079879-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0000080424/10-K/0000080424-22-000064/primary-document.html',
 '10k_files/sec-edgar-filings/0000080661/10-K/0000080661-22-000046/primary-document.html',
 '10k_files/sec-edgar-filings/0000084839/10-K/0000084839-22-000011/primary-document.html',
 '10k_files/sec-edgar-filings/0000086312/10-K/0000086312-22-000013/primary-document.html',
 '10k_files/sec-edgar-filings/0000087347/10-K/0001564590-22-002421/primary-document.html',
 '10k_files/sec-edgar-filings/0000089800/10-K/0000089800-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000091142/10-K/0000091142-22-000028/primary-document.html',
 '10k_files/sec-edgar-filings/0000091419/10-K/0000091419-22-000049/primary-document.html',
 '10k_files/sec-edgar-filings/0000091440/10-K/0000091440-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0000091576/10-K/0000091576-22-000029/primary-document.html',
 '10k_files/sec-edgar-filings/0000092122/10-K/0000092122-22-000003/primary-document.html',
 '10k_files/sec-edgar-filings/0000092230/10-K/0000092230-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0000092380/10-K/0000092380-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000093410/10-K/0000093410-22-000019/primary-document.html',
 '10k_files/sec-edgar-filings/0000093556/10-K/0000093556-22-000015/primary-document.html',
 '10k_files/sec-edgar-filings/0000093751/10-K/0000093751-22-000424/primary-document.html',
 '10k_files/sec-edgar-filings/0000096021/10-K/0000096021-22-000151/primary-document.html',
 '10k_files/sec-edgar-filings/0000096943/10-K/0000096943-22-000016/primary-document.html',
 '10k_files/sec-edgar-filings/0000097210/10-K/0001193125-22-049828/primary-document.html',
 '10k_files/sec-edgar-filings/0000097476/10-K/0000097476-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0000097745/10-K/0000097745-22-000011/primary-document.html',
 '10k_files/sec-edgar-filings/0000100493/10-K/0000100493-22-000097/primary-document.html',
 '10k_files/sec-edgar-filings/0000100517/10-K/0000100517-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0000100885/10-K/0001437749-22-002494/primary-document.html',
 '10k_files/sec-edgar-filings/0000101778/10-K/0000101778-22-000016/primary-document.html',
 '10k_files/sec-edgar-filings/0000101829/10-K/0000101829-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0000103379/10-K/0000103379-22-000006/primary-document.html',
 '10k_files/sec-edgar-filings/0000104169/10-K/0000104169-22-000012/primary-document.html',
 '10k_files/sec-edgar-filings/0000105770/10-K/0001628280-22-003342/primary-document.html',
 '10k_files/sec-edgar-filings/0000106040/10-K/0000106040-22-000055/primary-document.html',
 '10k_files/sec-edgar-filings/0000106535/10-K/0001564590-22-005707/primary-document.html',
 '10k_files/sec-edgar-filings/0000106640/10-K/0000106640-22-000015/primary-document.html',
 '10k_files/sec-edgar-filings/0000107263/10-K/0000107263-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000109198/10-K/0000109198-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0000109380/10-K/0000109380-22-000072/primary-document.html',
 '10k_files/sec-edgar-filings/0000200406/10-K/0000200406-22-000022/primary-document.html',
 '10k_files/sec-edgar-filings/0000202058/10-K/0000202058-22-000015/primary-document.html',
 '10k_files/sec-edgar-filings/0000217346/10-K/0000217346-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0000277135/10-K/0000277135-22-000012/primary-document.html',
 '10k_files/sec-edgar-filings/0000277948/10-K/0000277948-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0000310158/10-K/0000310158-22-000003/primary-document.html',
 '10k_files/sec-edgar-filings/0000310764/10-K/0000310764-22-000028/primary-document.html',
 '10k_files/sec-edgar-filings/0000313616/10-K/0000313616-22-000061/primary-document.html',
 '10k_files/sec-edgar-filings/0000313927/10-K/0001564590-22-005528/primary-document.html',
 '10k_files/sec-edgar-filings/0000315189/10-K/0001558370-22-018703/primary-document.html',
 '10k_files/sec-edgar-filings/0000315213/10-K/0000315213-22-000018/primary-document.html',
 '10k_files/sec-edgar-filings/0000315293/10-K/0001628280-22-003180/primary-document.html',
 '10k_files/sec-edgar-filings/0000316709/10-K/0000316709-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0000318154/10-K/0000318154-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0000319201/10-K/0000319201-22-000023/primary-document.html',
 '10k_files/sec-edgar-filings/0000320187/10-K/0000320187-22-000038/primary-document.html',
 '10k_files/sec-edgar-filings/0000320193/10-K/0000320193-22-000108/primary-document.html',
 '10k_files/sec-edgar-filings/0000320335/10-K/0000320335-22-000006/primary-document.html',
 '10k_files/sec-edgar-filings/0000352541/10-K/0000352541-22-000020/primary-document.html',
 '10k_files/sec-edgar-filings/0000352915/10-K/0001564590-22-006717/primary-document.html',
 '10k_files/sec-edgar-filings/0000354190/10-K/0001564590-22-005714/primary-document.html',
 '10k_files/sec-edgar-filings/0000354950/10-K/0000354950-22-000070/primary-document.html',
 '10k_files/sec-edgar-filings/0000701985/10-K/0000701985-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0000702165/10-K/0000702165-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000707549/10-K/0000707549-22-000107/primary-document.html',
 '10k_files/sec-edgar-filings/0000711404/10-K/0000711404-22-000053/primary-document.html',
 '10k_files/sec-edgar-filings/0000712515/10-K/0000712515-22-000011/primary-document.html',
 '10k_files/sec-edgar-filings/0000713676/10-K/0000713676-22-000019/primary-document.html',
 '10k_files/sec-edgar-filings/0000715957/10-K/0001564590-22-006589/primary-document.html',
 '10k_files/sec-edgar-filings/0000718877/10-K/0001628280-22-003992/primary-document.html',
 '10k_files/sec-edgar-filings/0000719739/10-K/0000719739-22-000023/primary-document.html',
 '10k_files/sec-edgar-filings/0000720005/10-K/0000720005-22-000066/primary-document.html',
 '10k_files/sec-edgar-filings/0000721371/10-K/0000721371-22-000058/primary-document.html',
 '10k_files/sec-edgar-filings/0000723125/10-K/0000723125-22-000048/primary-document.html',
 '10k_files/sec-edgar-filings/0000723254/10-K/0000723254-22-000019/primary-document.html',
 '10k_files/sec-edgar-filings/0000723531/10-K/0000950170-22-012734/primary-document.html',
 '10k_files/sec-edgar-filings/0000726728/10-K/0000726728-22-000046/primary-document.html',
 '10k_files/sec-edgar-filings/0000728535/10-K/0001437749-22-004457/primary-document.html',
 '10k_files/sec-edgar-filings/0000731766/10-K/0000731766-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0000731802/10-K/0000731802-22-000037/primary-document.html',
 '10k_files/sec-edgar-filings/0000732712/10-K/0000732712-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0000732717/10-K/0000732717-22-000015/primary-document.html',
 '10k_files/sec-edgar-filings/0000740260/10-K/0000740260-22-000057/primary-document.html',
 '10k_files/sec-edgar-filings/0000745732/10-K/0000745732-22-000014/primary-document.html',
 '10k_files/sec-edgar-filings/0000746515/10-K/0001564590-22-010381/primary-document.html',
 '10k_files/sec-edgar-filings/0000749251/10-K/0000749251-22-000006/primary-document.html',
 '10k_files/sec-edgar-filings/0000753308/10-K/0000753308-22-000014/primary-document.html',
 '10k_files/sec-edgar-filings/0000759944/10-K/0000759944-22-000025/primary-document.html',
 '10k_files/sec-edgar-filings/0000764180/10-K/0000764180-22-000019/primary-document.html',
 '10k_files/sec-edgar-filings/0000764478/10-K/0000764478-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0000764622/10-K/0000764622-22-000014/primary-document.html',
 '10k_files/sec-edgar-filings/0000765880/10-K/0001628280-22-002117/primary-document.html',
 '10k_files/sec-edgar-filings/0000766421/10-K/0000766421-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0000766704/10-K/0000766704-22-000013/primary-document.html',
 '10k_files/sec-edgar-filings/0000769397/10-K/0000769397-22-000019/primary-document.html',
 '10k_files/sec-edgar-filings/0000773840/10-K/0000773840-22-000018/primary-document.html',
 '10k_files/sec-edgar-filings/0000779152/10-K/0000779152-22-000076/primary-document.html',
 '10k_files/sec-edgar-filings/0000783325/10-K/0000107815-22-000116/primary-document.html',
 '10k_files/sec-edgar-filings/0000788784/10-K/0001628280-22-003860/primary-document.html',
 '10k_files/sec-edgar-filings/0000789019/10-K/0001564590-22-026876/primary-document.html',
 '10k_files/sec-edgar-filings/0000789570/10-K/0000789570-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0000796343/10-K/0000796343-22-000032/primary-document.html',
 '10k_files/sec-edgar-filings/0000797468/10-K/0000797468-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0000798354/10-K/0000798354-22-000004/primary-document.html',
 '10k_files/sec-edgar-filings/0000804328/10-K/0000804328-22-000021/primary-document.html',
 '10k_files/sec-edgar-filings/0000811156/10-K/0000811156-22-000048/primary-document.html',
 '10k_files/sec-edgar-filings/0000813672/10-K/0000813672-22-000012/primary-document.html',
 '10k_files/sec-edgar-filings/0000813828/10-K/0000813828-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0000814453/10-K/0000814453-22-000017/primary-document.html',
 '10k_files/sec-edgar-filings/0000815097/10-K/0000815097-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0000815556/10-K/0000815556-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0000818479/10-K/0000818479-22-000015/primary-document.html',
 '10k_files/sec-edgar-filings/0000820027/10-K/0000820027-22-000016/primary-document.html',
 '10k_files/sec-edgar-filings/0000820313/10-K/0001558370-22-000961/primary-document.html',
 '10k_files/sec-edgar-filings/0000821189/10-K/0000821189-22-000017/primary-document.html',
 '10k_files/sec-edgar-filings/0000822416/10-K/0000822416-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000823768/10-K/0001558370-22-001179/primary-document.html',
 '10k_files/sec-edgar-filings/0000827052/10-K/0000827052-22-000006/primary-document.html',
 '10k_files/sec-edgar-filings/0000827054/10-K/0000827054-22-000094/primary-document.html',
 '10k_files/sec-edgar-filings/0000829224/10-K/0000829224-22-000058/primary-document.html',
 '10k_files/sec-edgar-filings/0000831001/10-K/0000831001-22-000036/primary-document.html',
 '10k_files/sec-edgar-filings/0000831259/10-K/0000831259-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0000832101/10-K/0000832101-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0000833444/10-K/0000833444-22-000043/primary-document.html',
 '10k_files/sec-edgar-filings/0000842023/10-K/0001558370-22-013935/primary-document.html',
 '10k_files/sec-edgar-filings/0000849399/10-K/0000849399-22-000013/primary-document.html',
 '10k_files/sec-edgar-filings/0000851968/10-K/0000851968-22-000026/primary-document.html',
 '10k_files/sec-edgar-filings/0000857005/10-K/0000950170-22-025211/primary-document.html',
 '10k_files/sec-edgar-filings/0000858470/10-K/0000858470-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0000858877/10-K/0000858877-22-000013/primary-document.html',
 '10k_files/sec-edgar-filings/0000859737/10-K/0000859737-22-000022/primary-document.html',
 '10k_files/sec-edgar-filings/0000860730/10-K/0001193125-22-046707/primary-document.html',
 '10k_files/sec-edgar-filings/0000860731/10-K/0000860731-22-000011/primary-document.html',
 '10k_files/sec-edgar-filings/0000864749/10-K/0000864749-22-000044/primary-document.html',
 '10k_files/sec-edgar-filings/0000865752/10-K/0001104659-22-028182/primary-document.html',
 '10k_files/sec-edgar-filings/0000866787/10-K/0001558370-22-015239/primary-document.html',
 '10k_files/sec-edgar-filings/0000872589/10-K/0001804220-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000874716/10-K/0000874716-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000874761/10-K/0000874761-22-000022/primary-document.html',
 '10k_files/sec-edgar-filings/0000874766/10-K/0000874766-22-000019/primary-document.html',
 '10k_files/sec-edgar-filings/0000875045/10-K/0000875045-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000875320/10-K/0000875320-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000877212/10-K/0000877212-22-000026/primary-document.html',
 '10k_files/sec-edgar-filings/0000878927/10-K/0001564590-22-006303/primary-document.html',
 '10k_files/sec-edgar-filings/0000879101/10-K/0001437749-22-004700/primary-document.html',
 '10k_files/sec-edgar-filings/0000879169/10-K/0001558370-22-000902/primary-document.html',
 '10k_files/sec-edgar-filings/0000882095/10-K/0000882095-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0000882184/10-K/0000882184-22-000184/primary-document.html',
 '10k_files/sec-edgar-filings/0000882835/10-K/0000882835-22-000012/primary-document.html',
 '10k_files/sec-edgar-filings/0000883241/10-K/0000883241-22-000017/primary-document.html',
 '10k_files/sec-edgar-filings/0000884887/10-K/0000884887-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0000885725/10-K/0000885725-22-000006/primary-document.html',
 '10k_files/sec-edgar-filings/0000886982/10-K/0001193125-22-052682/primary-document.html',
 '10k_files/sec-edgar-filings/0000891103/10-K/0000891103-22-000020/primary-document.html',
 '10k_files/sec-edgar-filings/0000895421/10-K/0000895421-22-000400/primary-document.html',
 '10k_files/sec-edgar-filings/0000896159/10-K/0000896159-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0000896878/10-K/0000896878-22-000028/primary-document.html',
 '10k_files/sec-edgar-filings/0000898173/10-K/0000898173-22-000012/primary-document.html',
 '10k_files/sec-edgar-filings/0000899051/10-K/0000899051-22-000015/primary-document.html',
 '10k_files/sec-edgar-filings/0000899689/10-K/0000899689-22-000006/primary-document.html',
 '10k_files/sec-edgar-filings/0000900075/10-K/0000900075-22-000050/primary-document.html',
 '10k_files/sec-edgar-filings/0000906107/10-K/0001564590-22-005566/primary-document.html',
 '10k_files/sec-edgar-filings/0000906163/10-K/0000906163-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0000906345/10-K/0000906345-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0000908255/10-K/0000908255-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0000909832/10-K/0000909832-22-000021/primary-document.html',
 '10k_files/sec-edgar-filings/0000910606/10-K/0000950170-22-001418/primary-document.html',
 '10k_files/sec-edgar-filings/0000912595/10-K/0000950170-22-001423/primary-document.html',
 '10k_files/sec-edgar-filings/0000914208/10-K/0000914208-22-000319/primary-document.html',
 '10k_files/sec-edgar-filings/0000915389/10-K/0000915389-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0000915912/10-K/0000915912-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0000915913/10-K/0000915913-22-000027/primary-document.html',
 '10k_files/sec-edgar-filings/0000916076/10-K/0001564590-22-005965/primary-document.html',
 '10k_files/sec-edgar-filings/0000916365/10-K/0000916365-22-000049/primary-document.html',
 '10k_files/sec-edgar-filings/0000920148/10-K/0000920148-22-000015/primary-document.html',
 '10k_files/sec-edgar-filings/0000920522/10-K/0000920522-22-000019/primary-document.html',
 '10k_files/sec-edgar-filings/0000920760/10-K/0001628280-22-001450/primary-document.html',
 '10k_files/sec-edgar-filings/0000922224/10-K/0000922224-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0000927066/10-K/0000927066-22-000012/primary-document.html',
 '10k_files/sec-edgar-filings/0000927628/10-K/0000927628-22-000106/primary-document.html',
 '10k_files/sec-edgar-filings/0000927653/10-K/0000927653-22-000051/primary-document.html',
 '10k_files/sec-edgar-filings/0000935703/10-K/0000935703-22-000020/primary-document.html',
 '10k_files/sec-edgar-filings/0000936340/10-K/0000936340-22-000077/primary-document.html',
 '10k_files/sec-edgar-filings/0000936468/10-K/0000936468-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0000940944/10-K/0000940944-22-000042/primary-document.html',
 '10k_files/sec-edgar-filings/0000943452/10-K/0001628280-22-002997/primary-document.html',
 '10k_files/sec-edgar-filings/0000943819/10-K/0000943819-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0000945841/10-K/0000945841-22-000023/primary-document.html',
 '10k_files/sec-edgar-filings/0000946581/10-K/0001628280-22-014580/primary-document.html',
 '10k_files/sec-edgar-filings/0000947484/10-K/0000947484-22-000015/primary-document.html',
 '10k_files/sec-edgar-filings/0001000228/10-K/0001000228-22-000016/primary-document.html',
 '10k_files/sec-edgar-filings/0001000697/10-K/0001193125-22-051509/primary-document.html',
 '10k_files/sec-edgar-filings/0001001082/10-K/0001558370-22-001816/primary-document.html',
 '10k_files/sec-edgar-filings/0001001250/10-K/0001001250-22-000122/primary-document.html',
 '10k_files/sec-edgar-filings/0001002047/10-K/0000950170-22-011708/primary-document.html',
 '10k_files/sec-edgar-filings/0001002910/10-K/0001002910-22-000038/primary-document.html',
 '10k_files/sec-edgar-filings/0001004980/10-K/0001004980-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0001012100/10-K/0001628280-22-003294/primary-document.html',
 '10k_files/sec-edgar-filings/0001013237/10-K/0001013237-22-000159/primary-document.html',
 '10k_files/sec-edgar-filings/0001013462/10-K/0001013462-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0001013871/10-K/0001013871-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0001014473/10-K/0001014473-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001018724/10-K/0001018724-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0001020569/10-K/0001020569-22-000035/primary-document.html',
 '10k_files/sec-edgar-filings/0001022079/10-K/0001022079-22-000027/primary-document.html',
 '10k_files/sec-edgar-filings/0001022671/10-K/0001558370-22-002377/primary-document.html',
 '10k_files/sec-edgar-filings/0001024478/10-K/0001024478-22-000093/primary-document.html',
 '10k_files/sec-edgar-filings/0001031296/10-K/0001031296-22-000013/primary-document.html',
 '10k_files/sec-edgar-filings/0001032208/10-K/0001032208-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001034054/10-K/0001034054-22-000002/primary-document.html',
 '10k_files/sec-edgar-filings/0001035002/10-K/0001035002-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001035267/10-K/0001035267-22-000014/primary-document.html',
 '10k_files/sec-edgar-filings/0001035443/10-K/0001035443-22-000040/primary-document.html',
 '10k_files/sec-edgar-filings/0001037038/10-K/0001037038-22-000014/primary-document.html',
 '10k_files/sec-edgar-filings/0001037540/10-K/0001656423-22-000013/primary-document.html',
 '10k_files/sec-edgar-filings/0001037646/10-K/0001037646-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0001037868/10-K/0001037868-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0001038357/10-K/0001038357-22-000039/primary-document.html',
 '10k_files/sec-edgar-filings/0001039684/10-K/0001039684-22-000015/primary-document.html',
 '10k_files/sec-edgar-filings/0001041061/10-K/0001041061-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0001043277/10-K/0001043277-22-000006/primary-document.html',
 '10k_files/sec-edgar-filings/0001043604/10-K/0001043604-22-000017/primary-document.html',
 '10k_files/sec-edgar-filings/0001045609/10-K/0001564590-22-004436/primary-document.html',
 '10k_files/sec-edgar-filings/0001045810/10-K/0001045810-22-000036/primary-document.html',
 '10k_files/sec-edgar-filings/0001047862/10-K/0001047862-22-000039/primary-document.html',
 '10k_files/sec-edgar-filings/0001048286/10-K/0001628280-22-002666/primary-document.html',
 '10k_files/sec-edgar-filings/0001048695/10-K/0001048695-22-000033/primary-document.html',
 '10k_files/sec-edgar-filings/0001048911/10-K/0000950170-22-012762/primary-document.html',
 '10k_files/sec-edgar-filings/0001050915/10-K/0001050915-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0001051470/10-K/0001051470-22-000019/primary-document.html',
 '10k_files/sec-edgar-filings/0001053507/10-K/0001053507-22-000017/primary-document.html',
 '10k_files/sec-edgar-filings/0001057352/10-K/0001057352-22-000027/primary-document.html',
 '10k_files/sec-edgar-filings/0001058090/10-K/0001058090-22-000011/primary-document.html',
 '10k_files/sec-edgar-filings/0001058290/10-K/0001058290-22-000023/primary-document.html',
 '10k_files/sec-edgar-filings/0001059556/10-K/0001059556-22-000012/primary-document.html',
 '10k_files/sec-edgar-filings/0001060391/10-K/0001060391-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001063761/10-K/0001558370-22-001845/primary-document.html',
 '10k_files/sec-edgar-filings/0001065088/10-K/0001065088-22-000006/primary-document.html',
 '10k_files/sec-edgar-filings/0001065280/10-K/0001065280-22-000036/primary-document.html',
 '10k_files/sec-edgar-filings/0001065696/10-K/0001065696-22-000006/primary-document.html',
 '10k_files/sec-edgar-filings/0001067701/10-K/0001067701-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0001067983/10-K/0001564590-22-007322/primary-document.html',
 '10k_files/sec-edgar-filings/0001070750/10-K/0000950170-22-001965/primary-document.html',
 '10k_files/sec-edgar-filings/0001071739/10-K/0001071739-22-000071/primary-document.html',
 '10k_files/sec-edgar-filings/0001075531/10-K/0001075531-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0001086222/10-K/0001086222-22-000058/primary-document.html',
 '10k_files/sec-edgar-filings/0001090012/10-K/0001564590-22-005321/primary-document.html',
 '10k_files/sec-edgar-filings/0001090727/10-K/0001090727-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001090872/10-K/0001090872-22-000026/primary-document.html',
 '10k_files/sec-edgar-filings/0001091667/10-K/0001091667-22-000024/primary-document.html',
 '10k_files/sec-edgar-filings/0001093557/10-K/0001093557-22-000014/primary-document.html',
 '10k_files/sec-edgar-filings/0001094285/10-K/0001094285-22-000049/primary-document.html',
 '10k_files/sec-edgar-filings/0001095073/10-K/0001095073-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0001097149/10-K/0001097149-22-000011/primary-document.html',
 '10k_files/sec-edgar-filings/0001097864/10-K/0001628280-22-002416/primary-document.html',
 '10k_files/sec-edgar-filings/0001099219/10-K/0001099219-22-000014/primary-document.html',
 '10k_files/sec-edgar-filings/0001099800/10-K/0001099800-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0001100682/10-K/0001100682-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001101239/10-K/0001628280-22-003171/primary-document.html',
 '10k_files/sec-edgar-filings/0001103982/10-K/0001103982-22-000003/primary-document.html',
 '10k_files/sec-edgar-filings/0001108524/10-K/0001108524-22-000013/primary-document.html',
 '10k_files/sec-edgar-filings/0001109357/10-K/0001109357-22-000039/primary-document.html',
 '10k_files/sec-edgar-filings/0001110803/10-K/0001110803-22-000013/primary-document.html',
 '10k_files/sec-edgar-filings/0001111711/10-K/0001111711-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001113169/10-K/0001113169-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0001116132/10-K/0001116132-22-000018/primary-document.html',
 '10k_files/sec-edgar-filings/0001120193/10-K/0001120193-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001121788/10-K/0000950170-22-001303/primary-document.html',
 '10k_files/sec-edgar-filings/0001123360/10-K/0001123360-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001126328/10-K/0001104659-22-020401/primary-document.html',
 '10k_files/sec-edgar-filings/0001130310/10-K/0001130310-22-000023/primary-document.html',
 '10k_files/sec-edgar-filings/0001133421/10-K/0001133421-22-000004/primary-document.html',
 '10k_files/sec-edgar-filings/0001136869/10-K/0001564590-22-007160/primary-document.html',
 '10k_files/sec-edgar-filings/0001136893/10-K/0001136893-22-000038/primary-document.html',
 '10k_files/sec-edgar-filings/0001137774/10-K/0001137774-22-000038/primary-document.html',
 '10k_files/sec-edgar-filings/0001137789/10-K/0001137789-22-000055/primary-document.html',
 '10k_files/sec-edgar-filings/0001138118/10-K/0001138118-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0001140536/10-K/0000950170-22-001932/primary-document.html',
 '10k_files/sec-edgar-filings/0001140859/10-K/0001140859-22-000098/primary-document.html',
 '10k_files/sec-edgar-filings/0001141391/10-K/0001141391-22-000023/primary-document.html',
 '10k_files/sec-edgar-filings/0001156039/10-K/0001156039-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0001156375/10-K/0001156375-22-000076/primary-document.html',
 '10k_files/sec-edgar-filings/0001158449/10-K/0001158449-22-000037/primary-document.html',
 '10k_files/sec-edgar-filings/0001163165/10-K/0001562762-22-000031/primary-document.html',
 '10k_files/sec-edgar-filings/0001164727/10-K/0001164727-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001166691/10-K/0001166691-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0001170010/10-K/0001170010-22-000017/primary-document.html',
 '10k_files/sec-edgar-filings/0001174922/10-K/0001174922-22-000031/primary-document.html',
 '10k_files/sec-edgar-filings/0001175454/10-K/0001628280-22-004531/primary-document.html',
 '10k_files/sec-edgar-filings/0001179929/10-K/0001179929-22-000025/primary-document.html',
 '10k_files/sec-edgar-filings/0001260221/10-K/0001260221-22-000065/primary-document.html',
 '10k_files/sec-edgar-filings/0001262039/10-K/0001262039-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0001267238/10-K/0001267238-22-000006/primary-document.html',
 '10k_files/sec-edgar-filings/0001274494/10-K/0001274494-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0001278021/10-K/0000950170-22-001811/primary-document.html',
 '10k_files/sec-edgar-filings/0001280452/10-K/0001437749-22-004460/primary-document.html',
 '10k_files/sec-edgar-filings/0001281761/10-K/0001281761-22-000016/primary-document.html',
 '10k_files/sec-edgar-filings/0001283699/10-K/0001283699-22-000018/primary-document.html',
 '10k_files/sec-edgar-filings/0001285785/10-K/0001618034-22-000004/primary-document.html',
 '10k_files/sec-edgar-filings/0001286681/10-K/0000950170-22-002426/primary-document.html',
 '10k_files/sec-edgar-filings/0001289490/10-K/0001628280-22-004274/primary-document.html',
 '10k_files/sec-edgar-filings/0001297996/10-K/0001558370-22-002195/primary-document.html',
 '10k_files/sec-edgar-filings/0001300514/10-K/0001300514-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001306830/10-K/0001306830-22-000017/primary-document.html',
 '10k_files/sec-edgar-filings/0001318605/10-K/0000950170-22-000796/primary-document.html',
 '10k_files/sec-edgar-filings/0001324404/10-K/0001324404-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0001324424/10-K/0001324424-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0001326160/10-K/0001326160-22-000072/primary-document.html',
 '10k_files/sec-edgar-filings/0001326801/10-K/0001326801-22-000018/primary-document.html',
 '10k_files/sec-edgar-filings/0001335258/10-K/0001335258-22-000019/primary-document.html',
 '10k_files/sec-edgar-filings/0001336920/10-K/0001336920-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001341439/10-K/0001564590-22-023675/primary-document.html',
 '10k_files/sec-edgar-filings/0001352010/10-K/0001352010-22-000020/primary-document.html',
 '10k_files/sec-edgar-filings/0001364742/10-K/0001564590-22-007117/primary-document.html',
 '10k_files/sec-edgar-filings/0001370637/10-K/0001370637-22-000024/primary-document.html',
 '10k_files/sec-edgar-filings/0001373715/10-K/0001373715-22-000024/primary-document.html',
 '10k_files/sec-edgar-filings/0001374310/10-K/0001558370-22-001386/primary-document.html',
 '10k_files/sec-edgar-filings/0001383312/10-K/0001383312-22-000037/primary-document.html',
 '10k_files/sec-edgar-filings/0001385157/10-K/0001558370-22-017931/primary-document.html',
 '10k_files/sec-edgar-filings/0001389170/10-K/0001564590-22-006563/primary-document.html',
 '10k_files/sec-edgar-filings/0001390777/10-K/0001390777-22-000043/primary-document.html',
 '10k_files/sec-edgar-filings/0001393311/10-K/0001393311-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0001393612/10-K/0001393612-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001396009/10-K/0001396009-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0001402057/10-K/0001402057-22-000020/primary-document.html',
 '10k_files/sec-edgar-filings/0001403161/10-K/0001403161-22-000081/primary-document.html',
 '10k_files/sec-edgar-filings/0001403568/10-K/0001558370-22-004330/primary-document.html',
 '10k_files/sec-edgar-filings/0001408198/10-K/0001564590-22-004803/primary-document.html',
 '10k_files/sec-edgar-filings/0001410636/10-K/0001410636-22-000048/primary-document.html',
 '10k_files/sec-edgar-filings/0001413329/10-K/0001413329-22-000011/primary-document.html',
 '10k_files/sec-edgar-filings/0001413447/10-K/0001413447-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0001418135/10-K/0001418135-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0001419612/10-K/0001178913-22-000760/primary-document.html',
 '10k_files/sec-edgar-filings/0001437107/10-K/0001437107-22-000031/primary-document.html',
 '10k_files/sec-edgar-filings/0001442145/10-K/0001437749-22-004083/primary-document.html',
 '10k_files/sec-edgar-filings/0001463101/10-K/0001463101-22-000016/primary-document.html',
 '10k_files/sec-edgar-filings/0001466258/10-K/0001466258-22-000031/primary-document.html',
 '10k_files/sec-edgar-filings/0001467373/10-K/0001467373-22-000295/primary-document.html',
 '10k_files/sec-edgar-filings/0001467858/10-K/0001467858-22-000034/primary-document.html',
 '10k_files/sec-edgar-filings/0001474735/10-K/0001437749-22-004080/primary-document.html',
 '10k_files/sec-edgar-filings/0001478242/10-K/0001478242-22-000041/primary-document.html',
 '10k_files/sec-edgar-filings/0001489393/10-K/0001489393-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0001501585/10-K/0001501585-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001506307/10-K/0001506307-22-000018/primary-document.html',
 '10k_files/sec-edgar-filings/0001510295/10-K/0001510295-22-000011/primary-document.html',
 '10k_files/sec-edgar-filings/0001513761/10-K/0001558370-22-002516/primary-document.html',
 '10k_files/sec-edgar-filings/0001521332/10-K/0001521332-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0001524472/10-K/0001524472-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0001534701/10-K/0001534701-22-000078/primary-document.html',
 '10k_files/sec-edgar-filings/0001539838/10-K/0001539838-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0001551152/10-K/0001551152-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001551182/10-K/0001551182-22-000004/primary-document.html',
 '10k_files/sec-edgar-filings/0001555280/10-K/0001555280-22-000078/primary-document.html',
 '10k_files/sec-edgar-filings/0001564708/10-K/0001564708-22-000265/primary-document.html',
 '10k_files/sec-edgar-filings/0001571949/10-K/0001571949-22-000006/primary-document.html',
 '10k_files/sec-edgar-filings/0001579241/10-K/0001579241-22-000019/primary-document.html',
 '10k_files/sec-edgar-filings/0001585689/10-K/0001585689-22-000013/primary-document.html',
 '10k_files/sec-edgar-filings/0001590895/10-K/0001590895-22-000061/primary-document.html',
 '10k_files/sec-edgar-filings/0001590955/10-K/0001564590-22-005562/primary-document.html',
 '10k_files/sec-edgar-filings/0001596532/10-K/0001596532-22-000026/primary-document.html',
 '10k_files/sec-edgar-filings/0001596783/10-K/0001596783-22-000129/primary-document.html',
 '10k_files/sec-edgar-filings/0001601046/10-K/0001601046-22-000161/primary-document.html',
 '10k_files/sec-edgar-filings/0001601712/10-K/0001601712-22-000053/primary-document.html',
 '10k_files/sec-edgar-filings/0001604778/10-K/0001604778-22-000029/primary-document.html',
 '10k_files/sec-edgar-filings/0001613103/10-K/0001613103-22-000023/primary-document.html',
 '10k_files/sec-edgar-filings/0001618921/10-K/0001618921-22-000064/primary-document.html',
 '10k_files/sec-edgar-filings/0001633917/10-K/0001633917-22-000027/primary-document.html',
 '10k_files/sec-edgar-filings/0001637459/10-K/0001637459-22-000018/primary-document.html',
 '10k_files/sec-edgar-filings/0001645590/10-K/0001645590-22-000071/primary-document.html',
 '10k_files/sec-edgar-filings/0001652044/10-K/0001652044-22-000019/primary-document.html',
 '10k_files/sec-edgar-filings/0001659166/10-K/0001659166-22-000054/primary-document.html',
 '10k_files/sec-edgar-filings/0001666700/10-K/0001666700-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0001679273/10-K/0001558370-22-011121/primary-document.html',
 '10k_files/sec-edgar-filings/0001682852/10-K/0001682852-22-000012/primary-document.html',
 '10k_files/sec-edgar-filings/0001687229/10-K/0001687229-22-000002/primary-document.html',
 '10k_files/sec-edgar-filings/0001688568/10-K/0001688568-22-000027/primary-document.html',
 '10k_files/sec-edgar-filings/0001699150/10-K/0001628280-22-003991/primary-document.html',
 '10k_files/sec-edgar-filings/0001701605/10-K/0001701605-22-000050/primary-document.html',
 '10k_files/sec-edgar-filings/0001705696/10-K/0001705696-22-000046/primary-document.html',
 '10k_files/sec-edgar-filings/0001707925/10-K/0001628280-22-004180/primary-document.html',
 '10k_files/sec-edgar-filings/0001711269/10-K/0001711269-22-000008/primary-document.html',
 '10k_files/sec-edgar-filings/0001725057/10-K/0000950170-22-002143/primary-document.html',
 '10k_files/sec-edgar-filings/0001730168/10-K/0001730168-22-000118/primary-document.html',
 '10k_files/sec-edgar-filings/0001732845/10-K/0000950170-22-025444/primary-document.html',
 '10k_files/sec-edgar-filings/0001739940/10-K/0001739940-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001744489/10-K/0001744489-22-000213/primary-document.html',
 '10k_files/sec-edgar-filings/0001748790/10-K/0001748790-22-000024/primary-document.html',
 '10k_files/sec-edgar-filings/0001751788/10-K/0001751788-22-000011/primary-document.html',
 '10k_files/sec-edgar-filings/0001754301/10-K/0001628280-22-022584/primary-document.html',
 '10k_files/sec-edgar-filings/0001755672/10-K/0001755672-22-000005/primary-document.html',
 '10k_files/sec-edgar-filings/0001757898/10-K/0001757898-22-000011/primary-document.html',
 '10k_files/sec-edgar-filings/0001781335/10-K/0001781335-22-000007/primary-document.html',
 '10k_files/sec-edgar-filings/0001783180/10-K/0001783180-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0001792044/10-K/0001792044-22-000010/primary-document.html',
 '10k_files/sec-edgar-filings/0001821825/10-K/0001821825-22-000002/primary-document.html',
 '10k_files/sec-edgar-filings/0001841666/10-K/0001784031-22-000009/primary-document.html',
 '10k_files/sec-edgar-filings/0001868275/10-K/0001868275-22-000020/primary-document.html']
import re
from requests_html import HTMLSession

session = HTMLSession()
session.headers.update({'User-Agent':'Evan Trock ert226@lehigh.edu'})

new_df = []

for item in files:
    segment = item.split('/') # split the path into segments
    cik = segment[-4]  
    accession_number = segment[-2]       
    url = f'https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}-index.html'
    sleep(.3)
    #print(url)  
    r = session.get(url)
    date = r.html.find('#contentDiv > div:nth-child(1) > div.formContent > div:nth-child(1) > div:nth-child(2)', first=True).text
    new_df.append({'cik': cik, 'accession_number': accession_number, 'File Date': date, 'url': url })
df = pd.DataFrame(new_df)
print(df)

    
            cik      accession_number   File Date  \
0    0000001800  0001104659-22-025141  2022-02-18   
1    0000002488  0000002488-22-000016  2022-02-03   
2    0000002969  0000002969-22-000054  2022-11-22   
3    0000004127  0000004127-22-000038  2022-11-23   
4    0000004281  0000004281-22-000004  2022-02-14   
..          ...                   ...         ...   
493  0001783180  0001783180-22-000010  2022-02-08   
494  0001792044  0001792044-22-000010  2022-02-28   
495  0001821825  0001821825-22-000002  2022-03-21   
496  0001841666  0001784031-22-000009  2022-02-22   
497  0001868275  0001868275-22-000020  2022-02-25   

                                                   url  
0    https://www.sec.gov/Archives/edgar/data/000000...  
1    https://www.sec.gov/Archives/edgar/data/000000...  
2    https://www.sec.gov/Archives/edgar/data/000000...  
3    https://www.sec.gov/Archives/edgar/data/000000...  
4    https://www.sec.gov/Archives/edgar/data/000000...  
..                                                 ...  
493  https://www.sec.gov/Archives/edgar/data/000178...  
494  https://www.sec.gov/Archives/edgar/data/000179...  
495  https://www.sec.gov/Archives/edgar/data/000182...  
496  https://www.sec.gov/Archives/edgar/data/000184...  
497  https://www.sec.gov/Archives/edgar/data/000186...  

[498 rows x 4 columns]
print(sp500.columns)
print(df.columns)
Index(['Symbol', 'Security', 'SEC filings', 'GICS Sector', 'GICS Sub-Industry',
       'Headquarters Location', 'Date first added', 'CIK', 'Founded'],
      dtype='object')
Index(['cik', 'accession_number', 'File Date', 'url'], dtype='object')
sp500['CIK'] = sp500['CIK'].astype(str).str.zfill(10)
df['cik'] = df['cik'].astype(str).str.zfill(10)

# Perform inner merge
merged_df = pd.merge(sp500, df, left_on='CIK',right_on = 'cik', how='inner')
merged_df = merged_df.drop(columns=['cik', 'url', 'GICS Sector', 'GICS Sub-Industry', 'SEC filings', 'Headquarters Location', 'Date first added', 'Founded'])

merged_df

Symbol Security CIK accession_number File Date
0 MMM 3M 0000066740 0000066740-22-000010 2022-02-09
1 AOS A. O. Smith 0000091142 0000091142-22-000028 2022-02-11
2 ABT Abbott 0000001800 0001104659-22-025141 2022-02-18
3 ABBV AbbVie 0001551152 0001551152-22-000007 2022-02-18
4 ACN Accenture 0001467373 0001467373-22-000295 2022-10-12
... ... ... ... ... ...
496 YUM Yum! Brands 0001041061 0001041061-22-000009 2022-02-23
497 ZBRA Zebra Technologies 0000877212 0000877212-22-000026 2022-02-10
498 ZBH Zimmer Biomet 0001136869 0001564590-22-007160 2022-02-25
499 ZION Zions Bancorporation 0000109380 0000109380-22-000072 2022-02-25
500 ZTS Zoetis 0001555280 0001555280-22-000078 2022-02-15

501 rows × 5 columns

import pandas_market_calendars as mcal
import pandas as pd

# Get NYSE calendar
nyse = mcal.get_calendar('NYSE')

# Convert 'File Date' to datetime
merged_df['File Date'] = pd.to_datetime(merged_df['File Date'], errors='coerce')

# General function to calculate the nth trading day after the file date
def get_nth_trading_day_after(date, n):
    if pd.isnull(date):
        return None
    # Get the NYSE trading schedule for a period after the file date
    schedule = nyse.schedule(start_date=date, end_date=date + pd.Timedelta(days=n*2))
    trading_days = schedule.index
    if len(trading_days) >= n + 1:  # Ensure there are at least n+1 trading days
        return trading_days[n].date()  # Return the nth trading day after the file date
    else:
        return None

# Apply the function to calculate the 3rd and 10th trading days
merged_df['3rd_day_after'] = merged_df['File Date'].apply(lambda date: get_nth_trading_day_after(date, 3))
merged_df['10th_day_after'] = merged_df['File Date'].apply(lambda date: get_nth_trading_day_after(date, 10))

# Display the updated DataFrame
merged_df
Symbol Security CIK accession_number File Date 3rd_day_after 10th_day_after
0 MMM 3M 0000066740 0000066740-22-000010 2022-02-09 2022-02-14 2022-02-24
1 AOS A. O. Smith 0000091142 0000091142-22-000028 2022-02-11 2022-02-16 2022-02-28
2 ABT Abbott 0000001800 0001104659-22-025141 2022-02-18 2022-02-24 2022-03-07
3 ABBV AbbVie 0001551152 0001551152-22-000007 2022-02-18 2022-02-24 2022-03-07
4 ACN Accenture 0001467373 0001467373-22-000295 2022-10-12 2022-10-17 2022-10-26
... ... ... ... ... ... ... ...
496 YUM Yum! Brands 0001041061 0001041061-22-000009 2022-02-23 2022-02-28 2022-03-09
497 ZBRA Zebra Technologies 0000877212 0000877212-22-000026 2022-02-10 2022-02-15 2022-02-25
498 ZBH Zimmer Biomet 0001136869 0001564590-22-007160 2022-02-25 2022-03-02 2022-03-11
499 ZION Zions Bancorporation 0000109380 0000109380-22-000072 2022-02-25 2022-03-02 2022-03-11
500 ZTS Zoetis 0001555280 0001555280-22-000078 2022-02-15 2022-02-18 2022-03-02

501 rows × 7 columns

import yfinance as yf
import pandas as pd

# Initialize a new column for percent change
merged_df['3rd_Percent_Change'] = None
merged_df['10th_Percent_Change'] = None

# Iterate through each row in merged_df
for index, row in merged_df.iterrows():
    ticker = row['Symbol']
    start = row['File Date']
    end3 = row['3rd_day_after']
    end10= row['10th_day_after']
    
    # Fetch stock prices for the given ticker and date range
    stock_data = yf.download(ticker, start=start, end=end3, progress=False)
    
    # Calculate percent change from the first to the last available price
    if not stock_data.empty:
        first_close = stock_data['Close'].iloc[0]
        last_close = stock_data['Close'].iloc[-1]
        percent_change = (((last_close - first_close) / first_close) * 100).iloc[0]
        merged_df.at[index, '3rd_Percent_Change'] = percent_change

    stock_data = yf.download(ticker, start=start, end=end10, progress=False)
    if not stock_data.empty:
        first_close = stock_data['Close'].iloc[0]
        last_close = stock_data['Close'].iloc[-1]
        percent_change = (((last_close - first_close) / first_close) * 100).iloc[0]
        merged_df.at[index, '10th_Percent_Change'] = percent_change

# Display the updated DataFrame
merged_df
YF.download() has changed argument auto_adjust default to True



1 Failed download:
['ATVI']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['ATVI']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['ABC']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['ABC']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['BRK.B']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['BRK.B']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['BF.B']: YFPricesMissingError('possibly delisted; no price data found  (1d 2022-06-17 00:00:00 -> 2022-06-23)')

1 Failed download:
['BF.B']: YFPricesMissingError('possibly delisted; no price data found  (1d 2022-06-17 00:00:00 -> 2022-07-05)')

1 Failed download:
['CTLT']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['CTLT']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['CDAY']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['CDAY']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['DISH']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['DISH']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['RE']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['RE']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['FISV']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['FISV']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['FLT']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['FLT']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['PEAK']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['PEAK']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['MRO']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['MRO']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['PKI']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['PKI']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['PXD']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['PXD']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['SIVB']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['SIVB']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['WRK']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['WRK']: YFTzMissingError('possibly delisted; no timezone found')
Symbol Security CIK accession_number File Date 3rd_day_after 10th_day_after 3rd_Percent_Change 10th_Percent_Change
0 MMM 3M 0000066740 0000066740-22-000010 2022-02-09 2022-02-14 2022-02-24 -2.505488 -10.916856
1 AOS A. O. Smith 0000091142 0000091142-22-000028 2022-02-11 2022-02-16 2022-02-28 1.612681 -2.692448
2 ABT Abbott 0000001800 0001104659-22-025141 2022-02-18 2022-02-24 2022-03-07 0.393869 3.955834
3 ABBV AbbVie 0001551152 0001551152-22-000007 2022-02-18 2022-02-24 2022-03-07 1.895425 4.533798
4 ACN Accenture 0001467373 0001467373-22-000295 2022-10-12 2022-10-17 2022-10-26 1.059706 12.212577
... ... ... ... ... ... ... ... ... ...
496 YUM Yum! Brands 0001041061 0001041061-22-000009 2022-02-23 2022-02-28 2022-03-09 1.751798 -5.22247
497 ZBRA Zebra Technologies 0000877212 0000877212-22-000026 2022-02-10 2022-02-15 2022-02-25 -8.617104 -10.993725
498 ZBH Zimmer Biomet 0001136869 0001564590-22-007160 2022-02-25 2022-03-02 2022-03-11 0.918856 -2.983268
499 ZION Zions Bancorporation 0000109380 0000109380-22-000072 2022-02-25 2022-03-02 2022-03-11 -8.522337 -9.11396
500 ZTS Zoetis 0001555280 0001555280-22-000078 2022-02-15 2022-02-18 2022-03-02 -2.371548 -2.63957

501 rows × 9 columns

import re
import pandas as pd
from bs4 import BeautifulSoup

#measuring sentiment
#Load in Sentiment Values
#ML Negative
with open('inputs/ML_negative_unigram.txt', 'r') as file:
    BHR_negative = [line.strip().lower() for line in file]
BHR_negative.sort()
#ML Positive
with open('inputs/ML_positive_unigram.txt', 'r') as file:
    BHR_positive = [line.strip().lower() for line in file]
len(BHR_negative), len(BHR_positive)
BHR_positive.sort()
#LM Positive
file_path = "inputs/LM_MasterDictionary_1993-2021.csv" 
df_LM = pd.read_csv(file_path)
LM_positive = df_LM[df_LM['Positive'] > 0]['Word'].tolist()
LM_positive = [e.lower() for e in LM_positive]
#LM Negative
LM_negative = df_LM[df_LM['Negative'] > 0]['Word'].tolist()
LM_negative = [e.lower() for e in LM_negative]

print(len(LM_positive), len(LM_negative), len(BHR_positive), len(BHR_negative))
print(LM_positive[:5], LM_negative[:5], BHR_positive[:5], BHR_negative[:5])


347 2345 75 94
['able', 'abundance', 'abundant', 'acclaimed', 'accomplish'] ['abandon', 'abandoned', 'abandoning', 'abandonment', 'abandonments'] ['above', 'achieved', 'across', 'basis', 'benefit'] ['actions', 'address', 'affected', 'affecting', 'anticipated']
SC_words = ['Supply Chain', 'supply chain', 'supply chains', 'Supply Chains', 'supply-chain', 'supply-chains', 
            'logistics', 'logistic', 'logistical', 'logistically', 'logistics', 'logistics', 'logistics', 'logistics',
            'procurement', 'procure', 'procures', 'procured', 'procuring','distribution', 'distribute', 
            'distributes', 'distributed', 'distributing','inventory', 'inventories']
Risk_words = ['risk', 'risky', 'risks', 'risked', 'risking', 'riskiness', 'riskier', 'riskiest', 'riskily', 
              'riskinesses', 'riskiness', 'hazard', 'hazardous', 'hazards', 'hazardously', 'hazardousness',]
Comp_words = ['Compete', 'competes', 'competed', 'competing', 'competition', 'competitive', 'competitor', 'competitors', 
               'competitiveness', 'competitively', 'rival', 'rivals', 'rivalry', 'rivalries', 'contend', 'contending',
               'contender', 'contenders', 'contest', 'contests', 'contestant', 'contestants', 'contestable', 'contestably']
# Move around the 10k_files zip
done_with_downloads = True 

if os.path.exists('10k_files/sec-edgar-filings') and \
    not os.path.exists('10k_files/10k_files.zip') and \
    done_with_downloads:
    
    # zip the folder (2GB --> 150mb)
    shutil.make_archive('10k_files', 'zip', '10k_files')
    
    # delete the folder with all the files
    # I've commented this out for now so that we can keep opening up the 10-Ks up easily 
    shutil.rmtree('10k_files/sec-edgar-filings')
    
    # put the zip file in the `10k_files` folder
    shutil.move('10k_files.zip', '10k_files/')
import os
import re
import pandas as pd
from zipfile import ZipFile
from bs4 import BeautifulSoup
import fnmatch

# Ensure required columns exist in merged_df
sentiment_columns = [
    'LM_positive', 'LM_negative', 'BHR_positive', 'BHR_negative',
    'PosSC', 'NegSC', 'PosRisk', 'NegRisk', 'PosComp', 'NegComp'
]
for col in sentiment_columns:
    if col not in merged_df.columns:
        merged_df[col] = 0.0  # Initialize as float for ratios

# Open the zip file once
with ZipFile('10k_files/10k_files.zip', 'r') as zipfolder:
    file_list = zipfolder.namelist()
    
    for index, row in merged_df.iterrows():
        try:
            cik = str(row['CIK']).zfill(10)  # Zero-pad CIK
            filing_date = pd.to_datetime(row['File Date'], errors='coerce').strftime('%Y-%m-%d')
            
            if pd.isnull(filing_date):
                print(f"Invalid filing date for row {index}")
                continue
            
            firm_folder = f"sec-edgar-filings/{cik}/10-K/*/*.html"
            possible_files = fnmatch.filter(file_list, firm_folder)
            
            if not possible_files:
                continue
            
            fpath = possible_files[0]  # Use the first match
            merged_df.at[index, 'accession_number'] = fpath.split('/')[3]
            
            with zipfolder.open(fpath) as report_file:
                html = report_file.read().decode(encoding="utf-8")
            
            # Clean the HTML
            soup = BeautifulSoup(html, features='lxml-xml')
            for div in soup.find_all("div", {'style': 'display:none'}):
                div.decompose()
            
            document = soup.text.lower()
            document = re.sub(r'\W', ' ', document)   # Remove punctuation
            document = re.sub(r'\s+', ' ', document)  # Remove excess whitespace
            doc_length = len(document.split())
            
            # Compute sentiment ratios using regex
            merged_df.at[index, 'LM_positive'] = len(re.findall(r'\b(' + '|'.join(LM_positive) + r')\b', document)) / doc_length
            merged_df.at[index, 'LM_negative'] = len(re.findall(r'\b(' + '|'.join(LM_negative) + r')\b', document)) / doc_length
            merged_df.at[index, 'BHR_positive'] = len(re.findall(r'\b(' + '|'.join(BHR_positive) + r')\b', document)) / doc_length
            merged_df.at[index, 'BHR_negative'] = len(re.findall(r'\b(' + '|'.join(BHR_negative) + r')\b', document)) / doc_length
            
            # Compute sentiment proximity using NEAR_finder
            merged_df.at[index, 'PosSC'] = NEAR_finder(SC_words, LM_positive, document)[0] / doc_length
            merged_df.at[index, 'NegSC'] = NEAR_finder(SC_words, LM_negative, document)[0] / doc_length
            merged_df.at[index, 'PosRisk'] = NEAR_finder(Risk_words, LM_positive, document)[0] / doc_length
            merged_df.at[index, 'NegRisk'] = NEAR_finder(Risk_words, LM_negative, document)[0] / doc_length
            merged_df.at[index, 'PosComp'] = NEAR_finder(Comp_words, LM_positive, document)[0] / doc_length
            merged_df.at[index, 'NegComp'] = NEAR_finder(Comp_words, LM_negative, document)[0] / doc_length
         
        except Exception as e:
            print(f"Error processing row {index}: {e}")

merged_df

# import seaborn as sns
# import matplotlib.pyplot as plt
# merged_df['Day'] = pd.to_datetime(merged_df['File Date']).dt.day

# greater = merged_df[merged_df['Percent Change'] > 1]
# less = merged_df[merged_df['Percent Change'] < -0.75]

# greater_avg = greater['LM_positive'].mean()
# less_avg = less['LM_positive'].mean()

# # Prepare data for the bar chart
# avg_data = {
#     'Group': ['Percent Change > 1%', 'Percent Change < -0.75%'],
#     'Average LM Positive': [greater_avg, less_avg]
# }

# # Create a DataFrame for plotting
# avg_df = pd.DataFrame(avg_data)

# # Plot the bar chart
# sns.barplot(data=avg_df, x='Group', y='Average LM Positive').set(
#         title='Average LM Positive by Group', 
#         ylabel = 'Average LM Positive', xlabel = 'Group')
# plt.show()

#correlation table
import pandas as pd

# List of sentiment measures (5 positive and 5 negative)
sentiment_measures = ['LM_positive', 'LM_negative', 'BHR_positive', 'BHR_negative', 
                      'PosSC', 'NegSC', 'PosRisk', 'NegRisk', 'PosComp', 'NegComp']

# List of return measures
return_measures = ['3rd_Percent_Change', '10th_Percent_Change']

# Create an empty DataFrame to store the correlations
correlation_table = pd.DataFrame(index=sentiment_measures, columns=return_measures)

# Calculate the correlations
for sentiment in sentiment_measures:
    for ret in return_measures:
        correlation_table.loc[sentiment, ret] = merged_df[sentiment].corr(merged_df[ret])

correlation_table.to_csv('output/analysis_sample.csv')
# Display the correlation table
correlation_table
3rd_Percent_Change 10th_Percent_Change
LM_positive -0.030946 -0.022247
LM_negative -0.018455 -0.081565
BHR_positive 0.038237 -0.002071
BHR_negative 0.078296 0.076265
PosSC 0.005911 0.051537
NegSC -0.008613 -0.011625
PosRisk -0.123767 -0.155415
NegRisk -0.120052 -0.148263
PosComp -0.041347 -0.096421
NegComp -0.001314 -0.053195
#scatterplot with regression lines
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# List of all sentiment measures (5 positive and 5 negative)
sentiment_measures = ['LM_positive', 'LM_negative', 'BHR_positive', 'BHR_negative', 
                      'PosSC', 'NegSC', 'PosRisk', 'NegRisk', 'PosComp', 'NegComp']

# Create a long-form DataFrame for plotting
plot_data_3rd = merged_df.melt(
    id_vars=['Symbol', '3rd_Percent_Change'], value_vars=sentiment_measures, 
    var_name='Sentiment Measure', value_name='Sentiment Value'
)
plot_data_3rd['Return Measure'] = '3rd Percent Change'

plot_data_10th = merged_df.melt(
    id_vars=['Symbol', '10th_Percent_Change'], value_vars=sentiment_measures,
    var_name='Sentiment Measure', value_name='Sentiment Value'
)
plot_data_10th['Return Measure'] = '10th Percent Change'

# Combine the two datasets
plot_data = pd.concat([
    plot_data_3rd.rename(columns={'3rd_Percent_Change': 'Percent Change'}),
    plot_data_10th.rename(columns={'10th_Percent_Change': 'Percent Change'})
])

# Ensure 'Sentiment Value' and 'Percent Change' are numeric
plot_data['Sentiment Value'] = pd.to_numeric(plot_data['Sentiment Value'], errors='coerce')
plot_data['Percent Change'] = pd.to_numeric(plot_data['Percent Change'], errors='coerce')

# Drop rows with NaN values in these columns
plot_data = plot_data.dropna(subset=['Sentiment Value', 'Percent Change'])

# Create a FacetGrid for scatter plots with regression lines
g = sns.FacetGrid(
    plot_data, col='Sentiment Measure', row='Return Measure', sharex=False, sharey=False, height=4
)
# Map scatter plots with regression lines
g.map(sns.regplot, 'Sentiment Value', 'Percent Change', scatter_kws={'s': 10}, line_kws={'color': 'red'})

# Add titles and adjust layout
g.set_titles(col_template='{col_name}')
g.set_axis_labels('Sentiment Value', 'Percent Change')
plt.subplots_adjust(top=0.9)
g.fig.suptitle('Scatter Plots with Regression Lines: Sentiment Measures vs. Return Measures')

# Show the plot
plt.show()
plot_data.to_csv('output/Regression_data.csv')

png