
Commit 5e8c6e4

Add files via upload
1 parent 5e26049 commit 5e8c6e4

17 files changed, +218 -0 lines changed

parse/My Yaml.yaml

+24
DEFAULT:
  media_dir: 'C:\Users\beach\Desktop\media'

Folder Names to Include in Scan:
  RECURSIVE=YES:
    type_dir: ['movies', 'tvshows']

  RECURSIVE=NO:
    type_dir: ['home movies']

Label tags to move and create folder based on tag name:
  tags: ['144p', '240p', '360p', '480p', '720p', 'HD']

Tags to be removed from folder/files:
  tags: ['-(A)', '240p', '360p', '480p', '720p', 'HD', 'h.265', 'mkv', 'upload']

Suffix Deletion remove the last instance of tag and everything after last instance of:
  tags: ['-', 'HD', '.', '-', '(']

Prefix Deletion remove first instance of tag and everything before it:
  tags: ['SD', 'HD', '.', '-', ')']

Directory to output folder:
  output_dir: 'C:\Users\beach\Desktop\A different folder than the media_dir'
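A quick, hedged sketch of how these sections might be read back in Python (the key names are copied verbatim from the file above; the nested layout is assumed):

import yaml

# Load the config above and pull out the scan settings.
with open("My Yaml.yaml") as fh:
    cfg = yaml.load(fh, Loader=yaml.FullLoader)

media_dir = cfg['DEFAULT']['media_dir']
scan = cfg['Folder Names to Include in Scan']
recursive_dirs = scan['RECURSIVE=YES']['type_dir']   # ['movies', 'tvshows']
flat_dirs = scan['RECURSIVE=NO']['type_dir']         # ['home movies']
output_dir = cfg['Directory to output folder']['output_dir']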

parse/argparse_.py

+9
import argparse

parser = argparse.ArgumentParser(description='This script scrapes "https://www.samstores.com"')
parser.add_argument('--l', type=str,
                    metavar='link',
                    help='What is the link you want to scrape?')
args = parser.parse_args()

url = args.l

parse/config.ini

+3
[Folder Names to Include in Scan]
[RECURSIVE=Yes]
type_dir = 'movies'

@@ -0,0 +1 @@
df1.reset_index(level=0, inplace=True)
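For context, a minimal sketch of what this one-liner does, using a hypothetical frame indexed by 'stock': reset_index(level=0, inplace=True) moves the outermost index level back into an ordinary column.

import pandas as pd

# Hypothetical indexed frame, just to illustrate the call above.
df1 = pd.DataFrame({'close': [10.0, 11.5]},
                   index=pd.Index(['AAPL', 'MSFT'], name='stock'))
df1.reset_index(level=0, inplace=True)
print(df1)   # 'stock' is now a regular column next to 'close'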

@@ -0,0 +1,15 @@
import sqlite3 as lite

query = f'''CREATE TABLE temp_new AS
            SELECT
                Ticker as stock,
                Date as date,
                round(`Adj. Open`,2) as open,
                round(`Adj. High`,2) as high,
                round(`Adj. Low`,2) as low,
                round(`Adj. Close`,2) as close,
                round(`Adj. Volume`, 2) as volume
            FROM {self.db_daily_table_name}
            WHERE Date>='{first_year}-01-01'
        '''
self.big_db.cur.execute(query)
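The snippet above is lifted from a class (self.db_daily_table_name and self.big_db.cur are attributes not shown here). A standalone sketch of the same CREATE TABLE ... AS SELECT pattern, with a hypothetical database file, source table name, and cutoff year:

import sqlite3 as lite

con = lite.connect('stocks.db')         # hypothetical database file
cur = con.cursor()

db_daily_table_name = 'daily_prices'    # hypothetical source table
first_year = 2015

# Materialise a rounded, renamed copy of the daily table, as in the query above.
cur.execute(f'''CREATE TABLE temp_new AS
                SELECT Ticker AS stock, Date AS date,
                       round(`Adj. Close`, 2) AS close
                FROM {db_daily_table_name}
                WHERE Date >= '{first_year}-01-01'
             ''')
con.commit()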

@@ -0,0 +1,21 @@
import re

def get_email(raw_txt):
    try:
        regex = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"
        match = re.search(regex, raw_txt)
        email = match[0]
    except TypeError:  # re.search found nothing, so match is None
        email = ""

    return email

def get_phone(raw_txt):
    try:
        regex = r"(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}"
        match = re.search(regex, raw_txt)
        phone = match[0]
    except TypeError:  # no phone-like pattern in the text
        phone = ""

    return phone
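A quick usage check of the two helpers on a made-up contact line:

sample = "Reach John Doe at john.doe@example.com or +1 (555) 123-4567 during office hours."
print(get_email(sample))   # john.doe@example.com
print(get_phone(sample))   # +1 (555) 123-4567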

parse/groupby and apply.py

+19
import pandas as pd

def make_weekly(d):
    # Aggregate one (stock, week) group of daily rows into a single weekly bar.
    return pd.Series({
        'Open': d["open"].iloc[0],
        'High': d["high"].max(),
        'Low': d["low"].min(),
        'Close': d["close"].iloc[-1],
        'Volume': d["volume"].sum(),
    })

# df1 = df.groupby(df["Date"].index.week).apply(make_weekly)


# Label every row with its "YYYY-WW" ISO week so daily rows can be grouped per week.
df["Week"] = pd.to_datetime(df['date']).dt.year.astype(str).str.cat(
    pd.to_datetime(df['date']).dt.isocalendar().week.astype(str).str.zfill(2), sep='-')
# display(df)

df1 = df.groupby(['stock', 'Week']).apply(make_weekly)
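The snippet assumes a df of daily rows is already in memory; a minimal self-contained run might look like this (column names follow the weekly aggregation above):

# Tiny sample frame in the expected shape: lowercase OHLCV columns plus stock/date.
df = pd.DataFrame({
    'stock':  ['AAPL'] * 4,
    'date':   ['2020-01-06', '2020-01-07', '2020-01-13', '2020-01-14'],
    'open':   [100, 101, 103, 104],
    'high':   [102, 103, 105, 106],
    'low':    [99, 100, 102, 103],
    'close':  [101, 102, 104, 105],
    'volume': [1000, 1100, 1200, 1300],
})

df["Week"] = pd.to_datetime(df['date']).dt.year.astype(str).str.cat(
    pd.to_datetime(df['date']).dt.isocalendar().week.astype(str).str.zfill(2), sep='-')

# Two weekly bars, one per ISO week, each built by make_weekly.
print(df.groupby(['stock', 'Week']).apply(make_weekly))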

parse/ingredient_parser_tst.py

+2
from ingredient_parser.en import parse
print(parse('2 liters of milk'))

@@ -0,0 +1,32 @@
import pandas as pd
from IPython.display import display

dt1 = {
    'aa': ['j', 'b', 'e', 'g', 'i', 'c'],
    "ab": [4, 2, 5, 6, 1, 7],
}

dt2 = {
    'aa': ['b', 'e', 'i', 'j', 'c', 'g'],
    "ac": [4, 9, 5, 8, 3, 4],
}

df1 = pd.DataFrame(dt1)

# df1 = df1.set_index('aa')
display(df1)

df2 = pd.DataFrame(dt2)
# df2 = df2.set_index('aa')
display(df2)

# df3 = pd.concat([df1, df2], axis=1, sort=False)
# df3.reset_index(inplace=True)
# df3 = df3.rename(columns = {'index':'aa'})
# display(df3)

df3 = df1.merge(df2, how='left')
df3 = df3.reindex(sorted(df3.columns), axis=1)

df3 = df3[['ac', 'aa', 'ab']]
display(df3)

parse/name_parser.py

+9
from nameparser import HumanName
from nameparser.config import CONSTANTS
name = HumanName("Ralf Mühlenhöver, Geschäftsführer")

print(name.title)
print(name.first)
print(name.middle)
print(name.last)

parse/parse address.py

+11
import usaddress
addr = '123 Main St. Suite 100 Chicago, IL'

# The parse method will split your address string into components, and label each component.
# expected output: [(u'123', 'AddressNumber'), (u'Main', 'StreetName'), (u'St.', 'StreetNamePostType'), (u'Suite', 'OccupancyType'), (u'100', 'OccupancyIdentifier'), (u'Chicago,', 'PlaceName'), (u'IL', 'StateName')]
usaddress.parse(addr)

# The tag method will try to be a little smarter:
# it will merge consecutive components, strip commas, & return an address type.
# expected output: (OrderedDict([('AddressNumber', u'123'), ('StreetName', u'Main'), ('StreetNamePostType', u'St.'), ('OccupancyType', u'Suite'), ('OccupancyIdentifier', u'100'), ('PlaceName', u'Chicago'), ('StateName', u'IL')]), 'Street Address')
usaddress.tag(addr)

parse/parse config ini file.py

+7
import configparser

# parse config file
config = configparser.RawConfigParser()
config.read('config.ini')

a = config['RECURSIVE=Yes']['type_dir']
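Worth noting: RawConfigParser hands back the value exactly as written in config.ini, so a here is the string "'movies'" including the quotes. A minimal way to strip them before use:

# The ini value carries literal quotes from the file; strip them off.
type_dir = config['RECURSIVE=Yes']['type_dir'].strip("'")
print(type_dir)   # movies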

parse/parse date.py

+7
import dateparser
dateparser.parse('12/12/12')
dateparser.parse('Fri, 12 Dec 2014 10:55:50')
dateparser.parse('Martes 21 de Octubre de 2014') # Spanish (Tuesday 21 October 2014)
dateparser.parse('Le 11 Décembre 2014 à 09:00') # French (11 December 2014 at 09:00)
dateparser.parse('13 января 2015 г. в 13:34') # Russian (13 January 2015 at 13:34)
dateparser.parse('1 เดือนตุลาคม 2005, 1:00 AM') # Thai (1 October 2005, 1:00 AM)

parse/parse human names.py

+15
from nameparser import HumanName

full_name = "Ayad Saleh Muflahi Jr"

# Parse once and read the individual pieces off the HumanName object.
name = HumanName(full_name)
first_name = name.first
middle_name = name.middle
last_name = name.last
title = name.title
suffix = name.suffix

print(f"first_name: {first_name}")
print(f"middle_name: {middle_name}")
print(f"last_name: {last_name}")
print(f"title: {title}")
print(f"suffix: {suffix}")

parse/parse yaml file.py

+9
import yaml


with open("My Yaml.yaml") as file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to the Python dictionary format
    config = yaml.load(file, Loader=yaml.FullLoader)

print(config)

parse/parse_html.py

+14
import requests
from lxml import html

sess = requests.Session()
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

url = "https://stackoverflow.com/"
r = sess.get(url, headers=headers)
tree = html.fromstring(r.text)

# Question links from the page's summary blocks (the selector depends on the current page layout).
links = tree.xpath('//div[@class="summary"]/h3/a/@href')

parse/parse_xml.py

+20
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

sess = requests.Session()
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

url = "https://neue-pressemitteilungen.de/post_part1.xml"
r = sess.get(url, headers=headers)

# root = ET.fromstring(r.text)
# links = root.findall('./loc')
# print(links)

soup = BeautifulSoup(r.text, 'lxml')
links = [elm.get_text() for elm in soup.find_all('loc')]

print(links)
