Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import pandas as pd
# Location of the tab-delimited text file to load.
# NOTE(review): the file name suggests California maximum-temperature data;
# confirm against the data source.
url = "http://staff.stat.sinica.edu.tw/fredphoa/HW/HW1/CAmaxTemp.txt"

# Load the file as an unlabelled table: columns get integer labels (0, 1, ...)
# and are split into real fields further down.
raw = pd.read_csv(
    url,
    skiprows=1,  # ignore the first line of the file
    header=None,  # no header row is read; columns are named later
    sep=r"\t+",  # fields are separated by runs of one or more tabs
    engine="python",  # regex separators are only supported by this engine
)
# Final column names, in order, mapped to the dtype each column is cast to.
# The key order matters: it is used to label the parsed columns positionally.
columns = {
    # Identification fields, stored as pandas categoricals.
    "WBAN": "category",
    "Station": "category",
    "State": "category",
    # Kept as text; split into start/end dates by ``extract_period``.
    "Period": str,
    # One integer value per month.
    "JAN": int,
    "FEB": int,
    "MAR": int,
    "APR": int,
    "MAY": int,
    "JUN": int,
    "JUL": int,
    "AUG": int,
    "SEP": int,
    "OCT": int,
    "NOV": int,
    "DEC": int,
    # Trailing integer column labelled MAX.
    "MAX": int,
}
def extract_period(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with ``Start`` and ``End`` datetime columns.

    The ``Period`` column must contain strings holding two runs of digits
    separated by a hyphen (``"<digits>-<digits>"``). Each run is parsed
    with the format ``%Y%m`` (four-digit year followed by month).

    Args:
        df: Frame with a string ``Period`` column.

    Returns:
        A new frame with all original columns plus ``Start`` (parsed from
        the digits before the hyphen) and ``End`` (parsed from the digits
        after it). The input frame is not modified.
    """
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    bounds = df["Period"].str.extract(r"(\d+)-(\d+)")
    start = pd.to_datetime(bounds[0], format="%Y%m")
    end = pd.to_datetime(bounds[1], format="%Y%m")
    return df.assign(Start=start, End=end)
# Build the tidy frame from the two text columns of ``raw``:
#   * column 0 is split by regex into three captures: leading digits,
#     a run of capital letters/spaces before a comma, and the non-digit
#     text after the comma;
#   * column 1 is split on runs of whitespace into one column per field.
# The pieces are placed side by side, labelled with the keys of ``columns``
# (so the total number of pieces must equal ``len(columns)``), cast to the
# dtypes given there, and finally extended with Start/End dates.
df = (
    pd.concat(
        (
            raw[0].str.extract(
                r"(\d+)([A-Z ]+),(\D+)",
                expand=True,
            ),
            raw[1].str.split(
                r"\s+",
                expand=True,
            ),
        ),
        axis=1,
    )
    # A concrete list of labels rather than a dict view.
    .set_axis(list(columns), axis=1)
    .astype(columns)
    .pipe(extract_period)
)

# Preview of the first rows; only displayed when this is the last expression
# of a notebook cell or REPL input.
df.head()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement