import pandas as pdimport redf = pd.read_excel('marriage.xlsx')df.head()獲取到的數據裡,居住地是各地區的,為了便於分析,需要處理成省級行政區,學歷/月薪那一列數據,有些是月薪,有些是學歷,可以分別處理成兩列數據,是學歷的,提取出學歷層次,月薪標記為 「未知」;是月薪的,提取出月薪並計算,學歷標記為 「未知」。with open('地區.txt', 'r', encoding='utf-8') as f: area = f.read().split('\n')
print(area)print(len(area))['北京', '上海', '天津', '重慶', '黑龍江', '吉林', '遼寧', '內蒙古', '河北', '新疆', '甘肅', '青海', '陝西', '寧夏', '河南', '山東', '山西', '安徽', '湖北', '湖南', '江蘇', '四川', '貴州', '雲南', '廣西', '西藏', '浙江', '江西', '廣東', '福建', '臺灣', '海南', '香港', '澳門']34
def _match_region(address, regions, default='未知'):
    """Return the first entry of *regions* that occurs in *address*.

    Args:
        address: One cell of the '居住地' column.
        regions: Region names to look for, tried in order.
        default: Value returned when nothing matches.

    Returns:
        The first matching region name, or *default*.
    """
    # Empty Excel cells are read as NaN (a float), which does not support
    # the substring test below.
    if not isinstance(address, str):
        return default
    for region in regions:
        # A blank name (e.g. from a trailing newline in the list file) is a
        # substring of every string and would match every address.
        if region and region in address:
            return region
    return default


# One region label per row, in row order.
areas_list = [_match_region(i, area) for i in df['居住地']]
df['居住地'] = areas_listdf.head()結果如下:with open('學歷.txt', 'r', encoding='utf-8') as fp: edu = fp.read().split('\n')
print(edu)['博士', '碩士', '本科', '大專', '中專', '高中', '初中', '小學']
# Compiled once; raw string so that \d is not an invalid string escape.
_DIGITS = re.compile(r'\d+')


def _split_edu_salary(item, levels, unknown='未知'):
    """Split one '學歷/月薪' cell into an ``(education, salary)`` pair.

    A cell containing '元' is treated as a salary: every number in it is
    averaged and truncated to an int, and the education is *unknown*.
    Any other cell is treated as an education level: the first entry of
    *levels* found in it is returned, and the salary is *unknown*.

    Args:
        item: One cell of the '學歷/月薪' column.
        levels: Education level names to look for, tried in order.
        unknown: Placeholder for the half that the cell does not describe.

    Returns:
        ``(education, salary)``; either half may be *unknown*.
    """
    # Empty Excel cells are read as NaN (a float) and carry no information.
    if not isinstance(item, str):
        return unknown, unknown
    if '元' in item:
        amounts = [int(x) for x in _DIGITS.findall(item)]
        # No digits at all would otherwise divide by zero.
        if not amounts:
            return unknown, unknown
        return unknown, int(sum(amounts) / len(amounts))
    for level in levels:
        if level in item:
            return level, unknown
    return unknown, unknown


salary_list = []
edu_list = []
for item in df['學歷/月薪']:
    education, salary = _split_edu_salary(item, edu)
    edu_list.append(education)
    salary_list.append(salary)
print(len(edu_list))print(len(salary_list))df['學歷'] = edu_listdf['月薪'] = salary_listdf.head()結果如下:
這時候數據處理好了,可以刪掉學歷/月薪這一列,再重新保存到Excel。df.to_excel('處理後數據.xlsx', index=False)
數據分析
"""@File :男女佔比情況.py@Author :葉庭雲@CSDN :https://yetingyun.blog.csdn.net/"""import pandas as pdimport collectionsfrom pyecharts.charts import Piefrom pyecharts import options as optsfrom pyecharts.globals import ThemeType, CurrentConfig
# Point pyecharts at a local assets directory (machine-specific path).
CurrentConfig.ONLINE_HOST = 'D:/python/pyecharts-assets-master/assets/'

# Tally the profiles per gender as (gender, count) pairs, most frequent first.
df = pd.read_excel('處理後數據.xlsx')
gender = list(df['性別'])
gender_tally = collections.Counter(gender)
gender_count = [(label, total) for label, total in gender_tally.most_common()]
pie = Pie(init_opts=opts.InitOpts(theme=ThemeType.MACARONS))pie.add('性別', data_pair=gender_count, radius=["40%", "55%"], label_opts=opts.LabelOpts( position="outside", formatter="{a|{a}}{abg|}\n{hr|}\n {b|{b}: }{c} {per|{d}%} ", background_color="#eee", border_color="#aaa", border_width=1, border_radius=4, rich={ "a": {"color": "#999", "lineHeight": 22, "align": "center"}, "abg": { "backgroundColor": "#e3e3e3", "width": "100%", "align": "right", "height": 22, "borderRadius": [4, 4, 0, 0], }, "hr": { "borderColor": "#aaa", "width": "100%", "borderWidth": 0.5, "height": 0, }, "b": {"fontSize": 16, "lineHeight": 33}, "per": { "color": "#eee", "backgroundColor": "#334455", "padding": [2, 4], "borderRadius": 2, }, }, ),)pie.set_global_opts(title_opts=opts.TitleOpts(title='相親男女佔比情況'))pie.set_colors(['red', 'blue']) pie.render('男女佔比情況.html')相親男女中男士有 25910 人,佔比 45.72%;女士有 30767 人,佔比 54.28%。參加相親人數中女士多於男士。"""@File :年齡分布.py@Author :葉庭雲@CSDN :https://yetingyun.blog.csdn.net/"""import pandas as pdimport collectionsfrom pyecharts.charts import Barfrom pyecharts.globals import ThemeType, CurrentConfigfrom pyecharts import options as opts
# Point pyecharts at a local assets directory (machine-specific path).
CurrentConfig.ONLINE_HOST = 'D:/python/pyecharts-assets-master/assets/'

# Count the profiles per age and order the (age, count) pairs by age.
df = pd.read_excel('處理後數據.xlsx')
age = list(df['年齡'])
age_count = sorted(
    collections.Counter(age).most_common(),
    key=lambda pair: pair[0],
)
age = [value for value, _ in age_count]
nums = [total for _, total in age_count]

# Markers for the max, min and average values, plus an average line.
mark_points = opts.MarkPointOpts(
    data=[
        opts.MarkPointItem(type_="max", name="最大值"),
        opts.MarkPointItem(type_="min", name="最小值"),
        opts.MarkPointItem(type_="average", name="平均值"),
    ]
)
average_line = opts.MarkLineOpts(
    data=[opts.MarkLineItem(type_="average", name="平均值")]
)

bar = Bar(init_opts=opts.InitOpts(theme=ThemeType.MACARONS))
bar.add_xaxis(age)
bar.add_yaxis('人數', nums)
bar.set_global_opts(title_opts=opts.TitleOpts(title='相親男女年齡分布'))
bar.set_series_opts(
    label_opts=opts.LabelOpts(is_show=False),
    markpoint_opts=mark_points,
    markline_opts=average_line,
)
bar.render('年齡分布.html')
31 歲的相親男女人數最多,有 2637 人,各個年齡段都有一定數量的人,我們將年齡小於等於 20 歲,大於等於 70 歲的相親男女數據單獨提取出來看看。import pandas as pd
# Marital-status counts for the profiles aged 20 or younger.
df = pd.read_excel('處理後數據.xlsx')
df1 = df.loc[df['年齡'] <= 20]
df2 = df1['婚況'].value_counts()
print(df2)
結果如下:
未婚    153
離異      6
喪偶      2
Name: 婚況, dtype: int64
大部分是未婚,年紀輕輕就那麼急著相親嗎?再看看婚況是離異、喪偶的數據,
import pandas as pd
# Reload the cleaned data and keep the profiles aged 20 or younger.
df = pd.read_excel('處理後數據.xlsx')
is_20_or_younger = df['年齡'] <= 20
df1 = df[is_20_or_younger]
df3 = df1[df1['婚況'] == '離異']print(df3)網名 性別 ... 學歷 月薪17425 微風輕起 男士 ... 未知 5000029645 媳婦 女士 ... 大專 未知30398 仙妹 女士 ... 高中 未知30485 會員1415395937 男士 ... 未知 3500036684 微笑著變老 女士 ... 高中 未知49864 風吹動了風玲 女士 ... 高中 未知
[6 rows x 9 columns]
月薪寫著 50000、35000 的男士有些顯眼啊,去數據集中查看。
月薪 50000 的微風輕起,徵婚信息年齡寫的 19,徵婚宣言裡又寫到 1994 年 26 歲;月薪 35000 的會員某某某,徵婚信息年齡寫的 20,徵婚宣言裡又寫到 81 年的,看來網站裡年齡、身高這些信息真實性值得懷疑。"""@File :男女佔比情況.py@Author :葉庭雲@CSDN :https://yetingyun.blog.csdn.net/"""import pandas as pdimport collectionsfrom pyecharts.charts import Piefrom pyecharts import options as optsfrom pyecharts.globals import ThemeType, CurrentConfig
# Point pyecharts at a local assets directory (machine-specific path).
CurrentConfig.ONLINE_HOST = 'D:/python/pyecharts-assets-master/assets/'

# Drop the rows whose marital status is '未填寫', then tally the rest.
df = pd.read_excel('處理後數據.xlsx')
has_status = df['婚況'] != '未填寫'
data = df[has_status]
data_count = collections.Counter(data['婚況']).most_common()
print(data)

# Rich-text styles referenced by name from the label formatter below.
rich_styles = {
    "a": {"color": "#999", "lineHeight": 22, "align": "center"},
    "abg": {
        "backgroundColor": "#e3e3e3",
        "width": "100%",
        "align": "right",
        "height": 22,
        "borderRadius": [4, 4, 0, 0],
    },
    "hr": {
        "borderColor": "#aaa",
        "width": "100%",
        "borderWidth": 0.5,
        "height": 0,
    },
    "b": {"fontSize": 16, "lineHeight": 33},
    "per": {
        "color": "#eee",
        "backgroundColor": "#334455",
        "padding": [2, 4],
        "borderRadius": 2,
    },
}
label_style = opts.LabelOpts(
    position="outside",
    formatter="{a|{a}}{abg|}\n{hr|}\n {b|{b}: }{c} {per|{d}%} ",
    background_color="#eee",
    border_color="#aaa",
    border_width=1,
    border_radius=4,
    rich=rich_styles,
)

marital_pie = Pie()
marital_pie.add(
    "婚況",
    data_count,
    radius=["40%", "55%"],
    label_opts=label_style,
)
marital_pie.set_colors(["#8B008B", "#FF1493", "#000000"])
marital_pie.set_global_opts(title_opts=opts.TitleOpts(title="相親男女婚況"))
# c holds whatever render() returns, as in the chained form.
c = marital_pie.render("pie_rich_label.html")
相親男女婚況,離異的佔比 57.67%,未婚佔比 34.14%,喪偶佔比 8.19%。"""@File :學歷分布.py@Author :葉庭雲@CSDN :https://yetingyun.blog.csdn.net/"""import pandas as pdimport collectionsfrom pyecharts.charts import Piefrom pyecharts import options as optsfrom pyecharts.globals import CurrentConfig
# Point pyecharts at a local assets directory (machine-specific path).
CurrentConfig.ONLINE_HOST = 'D:/python/pyecharts-assets-master/assets/'

# Keep only the rows with a known education level and tally them.
df = pd.read_excel('處理後數據.xlsx')
has_education = df['學歷'] != '未知'
data = df[has_education]
data_count = collections.Counter(data['學歷']).most_common()

canvas = opts.InitOpts(width="800px", height="500px", bg_color="#2c343c")
chart_title = opts.TitleOpts(
    title="相親男女學歷",
    pos_left="center",
    pos_top="20",
    title_textstyle_opts=opts.TextStyleOpts(color="#fff"),
)
hover_tooltip = opts.TooltipOpts(
    trigger="item",
    formatter="{a} <br/>{b}: {c} ({d}%)",
)

education_pie = Pie(init_opts=canvas)
education_pie.add(
    series_name="相親男女學歷",
    data_pair=data_count,
    rosetype="radius",
    radius="55%",
    center=["50%", "50%"],
    label_opts=opts.LabelOpts(is_show=False, position="center"),
)
education_pie.set_colors(
    ["#00BFFF", "#00FF7F", "#FF1493", "#8B008B", "#FFFF00", "#556B2F"]
)
education_pie.set_global_opts(
    title_opts=chart_title,
    legend_opts=opts.LegendOpts(is_show=False),
)
education_pie.set_series_opts(
    tooltip_opts=hover_tooltip,
    label_opts=opts.LabelOpts(color="#fff"),
)
# c holds whatever render() returns, as in the chained form.
c = education_pie.render("相親男女學歷.html")
相親男女學歷大部分在高中(35.92%)、大專(24.72%),兩者合計約佔六成;本科佔比 20.7%,中專佔比 16.35%;碩士、博士高學歷的相親男女人數較少,分別佔比 2.14%、0.17%。"""@File :地區分布.py@Author :葉庭雲@CSDN :https://yetingyun.blog.csdn.net/"""import pandas as pdimport collectionsfrom pyecharts import options as optsfrom pyecharts.charts import Geofrom pyecharts.globals import ChartTypefrom pyecharts.globals import ThemeType, CurrentConfig
# Point pyecharts at a local assets directory (machine-specific path).
CurrentConfig.ONLINE_HOST = 'D:/python/pyecharts-assets-master/assets/'

# Tally the profiles per region and keep the 34 most frequent labels.
df = pd.read_excel('處理後數據.xlsx')
area = list(df['居住地'])
area_tally = collections.Counter(area)
area_count = area_tally.most_common(34)
print(area_count)
geo = Geo(init_opts=opts.InitOpts(width='1000px', height='600px', theme=ThemeType.DARK))geo.add_schema(maptype='china', label_opts=opts.LabelOpts(is_show=True))geo.add('相親男女人數', data_pair=area_count, type_=ChartType.EFFECT_SCATTER)geo.set_series_opts(label_opts=opts.LabelOpts(is_show=False)) geo.set_global_opts(title_opts=opts.TitleOpts(title="相親男女地區分布"), visualmap_opts=opts.VisualMapOpts(max_=5000, is_piecewise=True, pieces=[{"max": 1000, "min": 100, "label": "100-1000", "color": "#708090"}, {"max": 1500, "min": 1001, "label": "1001-1500", "color": "#00008B"}, {"max": 2000, "min": 1501, "label": "1501-2000", "color": "#483D8B"}, {"max": 2500, "min": 2001, "label": "2001-2500", "color": "#1E90FF"}, {"max": 3000, "min": 2501, "label": "2501-3000", "color": "#8B008B"}, {"max": 5000, "min": 3001, "label": ">=3000", "color": "#FF0000"}]) )geo.render('地區分布.html')[('重慶', 4436), ('廣東', 2637), ('四川', 2519), ('山東', 2398), ('河南', 2160), ('上海', 2156), ('雲南', 2039), ('北京', 2037), ('臺灣', 1997), ('安徽', 1920), ('江蘇', 1919), ('天津', 1918), ('黑龍江', 1918), ('湖南', 1800), ('新疆', 1799), ('遼寧', 1680), ('甘肅', 1680), ('廣西', 1679), ('湖北', 1679), ('內蒙古', 1559), ('山西', 1440), ('福建', 1440), ('江西', 1440), ('浙江', 1440), ('陝西', 1439), ('河北', 1439), ('青海', 1339), ('貴州', 1200), ('吉林', 1080), ('西藏', 942), ('寧夏', 702), ('海南', 360), ('香港', 353), ('澳門', 117)]徵婚宣言一般是介紹自己情況,表達對另一半的要求和期望,下面我們分別來看看相親男女徵婚宣言裡關鍵詞都有些什麼。"""@File :徵婚宣言詞雲.py@Author :葉庭雲@CSDN :https://yetingyun.blog.csdn.net/"""import pandas as pdimport jiebaimport collectionsimport refrom wordcloud import WordCloudimport matplotlib.pyplot as pltimport numpy as npfrom PIL import Image
# Only the gender and declaration columns are needed; keep the rows whose
# gender is '女士'.
df = pd.read_excel('處理後數據.xlsx')[['性別', '徵婚宣言']]
df2 = df[df['性別'] == '女士']['徵婚宣言']

# One stop word per line of the file.
with open('stop_words.txt', encoding='utf-8') as f:
    con = f.read().split('\n')
stop_words = set(con)

# Matches runs of characters in the range U+4E00..U+9FA5; compiled once
# instead of on every declaration.
han_runs = re.compile(r'[\u4e00-\u9fa5]+', re.S)

result_list = []
for data in df2:
    # Empty Excel cells are read as NaN (a float), which re cannot search.
    if not isinstance(data, str):
        continue
    new_data = "/".join(han_runs.findall(data))
    # NOTE(review): cut_all=True appears to be jieba's full mode rather than
    # exact mode - confirm this is the intended segmentation.
    seg_list = jieba.cut(new_data, cut_all=True)
    for word in seg_list:
        # Drop stop words and tokens of length 1 or less.
        if word not in stop_words and len(word) > 1:
            result_list.append(word)

print(result_list)
word_counts = collections.Counter(result_list)
# Invert the mask image's pixel values before handing it to WordCloud.
mask_ = 255 - np.array(Image.open('woman_mask.png'))
my_cloud = WordCloud(
    background_color='white',
    mask=mask_,
    font_path='simhei.ttf',
    max_font_size=112,
    min_font_size=12,
    random_state=88,
).generate_from_frequencies(word_counts)
plt.figure(figsize=(8, 5), dpi=200)plt.imshow(my_cloud, interpolation='bilinear')plt.axis('off')plt.savefig('woman_cloud.png', dpi=200)plt.show()結果如下:
相親男女徵婚宣言裡,喜歡、希望、生活、善良、真誠、真心、幸福、性格都是出現頻率高的詞語。程式設計師如何避免陷入「內卷」、選擇什麼技術最有前景,中國開發者現狀與技術趨勢究竟是什麼樣?快來參與「2020 中國開發者大調查」,更有豐富獎品送不停!
☞估值飆至 280 億美元,Databricks G 輪融資 10 億美元,誰說開源不掙錢?
☞編程網站 Perl.com 被劫,售價 19 萬美元