High-paying, low-stress jobs
- Data taken from an article on time.com
In [1]:
import requests
import bs4
In [2]:
from string import punctuation
# Set of ASCII punctuation characters, used later to strip symbols out of
# scraped text before numeric conversion.
exclude = {*punctuation}
In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Render matplotlib figures inline, directly beneath the cell that creates them.
%matplotlib inline
# Apply seaborn's default styling to every plot drawn after this point.
sns.set()
In [4]:
# Address of the time.com article that is scraped below.
url = "http://time.com/4081673/high-paying-low-stress-jobs/"
In [5]:
# Download the article page. requests applies no timeout by default, so an
# explicit one keeps this cell from hanging forever on an unresponsive server.
rs = requests.get(url, timeout=30)
# Fail fast on an HTTP error (4xx/5xx) instead of silently parsing an error
# page in the cells below.
rs.raise_for_status()
In [6]:
# Display the HTTP status code returned for the request.
rs.status_code
Out[6]:
In [7]:
#rs.content
In [8]:
# with open('high_paying_low_stress_jobs.html', 'wb') as fd:
# for chunk in rs.iter_content():
# fd.write(chunk)
In [9]:
# Parse the downloaded HTML with the lxml parser so it can be searched by tag
# and CSS class.
soup = bs4.BeautifulSoup(markup=rs.text, features='lxml')
In [10]:
# Collect the elements of interest: the <h2> item titles, plus the <p> tags
# carrying the p1, p2 and p4 CSS classes.
h2 = soup.find_all('h2', attrs={'class': 'article-item-title'})
p1 = soup.find_all('p', attrs={'class': 'p1'})
p2 = soup.find_all('p', attrs={'class': 'p2'})
p4 = soup.find_all('p', attrs={'class': 'p4'})
In [11]:
# Show the text of the first <p class="p3"> element on the page.
soup.find_all('p', attrs={'class': 'p3'})[0].get_text()
Out[11]:
In [12]:
# One <section class="article-item-body"> per listed item; each is parsed for
# its stress score, salary and remark further down.
bd = soup.find_all('section', attrs={'class': 'article-item-body'})
There are 24 jobs listed in the Time article
- Jobs
- Stress Tolerance
- Average Salary
- Remarks

These are parsed into separate lists and then combined into a pandas DataFrame.
In [13]:
# Job titles: the text of every item heading collected in `h2`.
jobs = [heading.get_text() for heading in h2]
In [14]:
# Walk every item body and pull three fields out of its text, line by line,
# into parallel lists:
#   stress  - float after "Stress tolerance: "
#   aas     - int built from the text after the first ": " on lines starting
#             with "Average" (punctuation removed first)
#   remarks - text after the first ": " on lines starting with "What"
# Lines matching none of the prefixes are ignored.
stress = []
aas = []
remarks = []
for section in bd:
    for line in section.get_text().split('\n'):
        if line.startswith("Stress tolerance:"):
            # maxsplit=1: only the first ": " separates label from value.
            stress.append(float(line.split(": ", 1)[1]))
        elif line.startswith("Average"):
            salary_text = line.split(": ", 1)[1]
            # Strip every punctuation character (presumably "$" and thousands
            # separators - confirm against the page) so the amount parses as int.
            aas.append(int(''.join(ch for ch in salary_text if ch not in exclude)))
        elif line.startswith("What"):
            # maxsplit=1 keeps the whole remark even when the remark itself
            # contains ": "; a plain split would cut it off at the second one.
            remarks.append(line.split(": ", 1)[1])
In [15]:
# Combine the parsed lists into a single table:
# J = job title, ST = stress tolerance, Sal = average salary, Remarks = remark text.
df = pd.DataFrame(
    data={
        'J': jobs,
        'ST': stress,
        'Sal': aas,
        'Remarks': remarks,
    }
)
In [16]:
# Preview the first rows of the parsed table.
df.head()
Out[16]:
In [17]:
# Summary statistics for the table's columns.
df.describe()
Out[17]:
In [18]:
# Numeric-only view for plotting: keep stress tolerance and salary, indexed by
# job title. Built in one non-mutating expression (no inplace set_index / del on
# a derived frame), so re-running the cell always yields the same result.
df2 = df[['J', 'ST', 'Sal']].set_index('J')
In [19]:
# Preview the first rows of the job-indexed table.
df2.head()
Out[19]:
In [20]:
# Joint distribution of stress tolerance (x) against salary (y) with a fitted
# regression line. x/y are passed by keyword because recent seaborn releases no
# longer accept them positionally. The former `size=6` argument (since renamed
# to `height`) is omitted: 6 is the default under either name, so the figure is
# unchanged on old and new seaborn alike.
g = sns.jointplot(x="ST", y="Sal", data=df2, kind="reg", color="r")
In [21]:
# Regression of stress tolerance (y) on salary (x). x/y are given by keyword:
# recent seaborn releases reject positional data arguments.
ax = sns.regplot(x='Sal', y='ST', data=df2)
In [22]:
# Order the jobs by average salary (ascending) and draw one horizontal bar per job.
df2 = df2.sort_values('Sal')
ax = sns.barplot(x=df2['Sal'], y=df2.index, data=df2)
In [23]:
# Re-order the jobs by stress tolerance (ascending) and draw one horizontal bar per job.
df2 = df2.sort_values('ST')
ax = sns.barplot(x=df2['ST'], y=df2.index, data=df2)
In [24]:
# Keep only the rows whose stress tolerance is above 65, ordered by salary.
above_threshold = df['ST'] > 65
df3 = df.loc[above_threshold].sort_values(by='Sal')
In [25]:
# Display the filtered, salary-sorted table.
df3
Out[25]:
In [26]:
# Print a numbered list of the filtered jobs, one "<n> <job> : <remark>" entry
# per job, separated by blank lines.
# Columns are addressed by label rather than by position: positional [0]/[1]
# depends on the column order pandas derives from the dict passed to
# DataFrame (sorted keys on older releases, insertion order on newer ones), so
# the same code printed different columns on different installs.
# NOTE(review): 'Remarks' is chosen as the second field because it is what
# position 1 resolves to under sorted column order - confirm this is the
# intended output.
for rank, (job, remark) in enumerate(zip(df3['J'], df3['Remarks']), start=1):
    print(rank, job, ":\t", remark, end='\n\n')