Weather big data analysis system based on hive (conduct big data analysis through hive, import the analyzed data to MySQL through sqoop, and visualize the data based on MySQL through Django)
- Introduction to Hive: Hive is a data warehouse infrastructure built on top of Hadoop, which provides a SQL-like language (HQL) to query and analyze large-scale data sets. With Hive, we can do complex data processing and analysis in a distributed storage system.
-
Introduction to Sqoop: Sqoop is a tool for transferring data between Apache Hadoop and relational databases. We can use Sqoop to export the analysis results from Hive to a relational database, such as MySQL, for further processing and visualization.
-
Django Overview: Django is an advanced Python Web framework that provides a collection of tools and libraries for quickly building Web applications. We can leverage Django to connect to a MySQL database, process the data, and present it as a visual interface.
Hive big data analysis SQL, create hive table based on data, and then perform data analysis
-- Create database
CREATE DATABASE IF NOT EXISTS big_data;
-- switch tobig_datadatabase
USE big_data;
load data local inpath '/export/server/28' INTO TABLE weather_data;
-- createweather_datasurface
CREATE TABLE IF NOT EXISTS weather_data (
`date` STRING,
high_temperature STRING,
low_temperature STRING,
weather STRING,
wind_direction STRING,
city STRING
)ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
-- Insert data intoweather_datasurface(Sample data)
INSERT INTO TABLE weather_data VALUES
(1, '2022-01-01 Saturday', '6°', '-7°', 'clear', 'Northwest wind3class', 'Beijing'),
(2, '2022-01-02 Sunday', '2°', '-7°', 'partly cloudy', 'south wind2class', 'Beijing');
-- createetl_weather_datasurface
CREATE TABLE IF NOT EXISTS etl_weather_data (
`date` STRING,
day_of_week STRING,
high_temperature INT,
low_temperature INT,
weather STRING,
wind_direction STRING,
wind_speed STRING,
city STRING
);
-- Insert data intoetl_weather_datasurface
INSERT INTO TABLE etl_weather_data
SELECT
SUBSTR(`date`, 1, INSTR(`date`, ' ') - 1) AS `date`,
SUBSTR(`date`, INSTR(`date`, ' ') + 1) AS day_of_week,
CAST(SUBSTR(high_temperature, 1, INSTR(high_temperature, '°') - 1) AS INT) AS high_temperature,
CAST(SUBSTR(low_temperature, 1, INSTR(low_temperature, '°') - 1) AS INT) AS low_temperature,
weather,
REGEXP_REPLACE(SUBSTR(wind_direction, 1, INSTR(wind_direction, 'class') - 1), '[0-9]', '') AS wind_direction,
SUBSTR(SUBSTR(wind_direction, INSTR(wind_direction, 'wind') + 1),1,1) AS wind_speed,
city
FROM
weather_data;
-- 1.Count the number of sunny days in each city in a yeartop10
CREATE TABLE IF NOT EXISTS top_sunny_cities (
city STRING,
sunny_days_count INT
);
INSERT INTO TABLE top_sunny_cities
SELECT
city,
COUNT(*) AS sunny_days_count
FROM
etl_weather_data
WHERE
weather LIKE '%clear%'
GROUP BY
city
ORDER BY
sunny_days_count DESC
LIMIT 10;
-- 2.Statistics on temperature difference changes in Beijing every month of the year
CREATE TABLE IF NOT EXISTS monthly_max_temperature_difference (
month_year STRING,
max_temperature_difference INT
);
INSERT INTO TABLE monthly_max_temperature_difference
SELECT
CONCAT(YEAR(`date`), '-', LPAD(MONTH(`date`), 2, '0')) AS month_year,
MAX(high_temperature - low_temperature) AS max_temperature_difference
FROM
etl_weather_data
WHERE
city = 'Beijing'
GROUP BY
YEAR(`date`), MONTH(`date`);
-- 3.Statistics city appears3The one with the highest wind speed above grade 110cities
CREATE TABLE IF NOT EXISTS top_cities_high_wind (
city STRING,
high_wind_days_count INT
);
INSERT INTO TABLE top_cities_high_wind
SELECT
city,
COUNT(*) AS high_wind_days_count
FROM
etl_weather_data
WHERE
CAST(wind_speed AS INT) >= 3
GROUP BY
city
ORDER BY
high_wind_days_count DESC
LIMIT 10;
Import data into MySQL based on sqoop
sqoop export \
--connect jdbc:mysql://192.168.138.1:3306/big_data \
--username root --password '123456' \
--table top_sunny_cities_sqoop \
--export-dir /hive/warehouse/big_data.db/big_data.dbbig_data.db/top_sunny_cities \
--input-fields-terminated-by '\001' \
--input-lines-terminated-by '\n';
sqoop export \
--connect jdbc:mysql:// 192.168.138.1:3306/big_data \
--username root --password 123456 \
--table monthly_max_temperature_difference \
--export-dir /user/hive/warehouse/big_data.db/big_data.dbmonthly_max_temperature_difference \
--input-fields-terminated-by '\001' \
--input-lines-terminated-by '\n'
sqoop export \
--connect jdbc:mysql:// 192.168.138.1:3306/big_data \
--username root --password 123456 \
--table top_cities_high_wind \
--export-dir /user/hive/warehouse/big_data.db/big_data.dbtop_cities_high_wind \
--input-fields-terminated-by '\001' \
--input-lines-terminated-by '\n'
Data visualization with Django based on MySQL data
from django.contrib.auth.decorators import login_required
from django.http import HttpResponse
from django.shortcuts import render, redirect
from django.shortcuts import render
from pyecharts import options as opts
from pyecharts.charts import Line, Bar, Pie, Grid
# Create your views here.
from django.shortcuts import render
from pyecharts.globals import ThemeType
from api.service.task_service import get_user, top_sunny_cities, monthly_max_temperature_difference, \
top_cities_high_wind, top_rainy_cities, monthly_rainy_days, yearly_min_temperatures, daily_wind_speed, \
daily_temperature_difference, register_user
def login_page(request):
if request.method == 'POST':
username = request.POST.get('username')
password = request.POST.get('password')
user = get_user(username,password)
if user is not None:
return redirect('home')
else:
return render(request, 'login.html', {'error_message': 'Invalid login credentials.'})
return render(request, 'login.html')
def register_view(request):
# Handle registration logic
if request.method == 'GET':
username = request.GET.get('username')
password = request.GET.get('password')
if username and password:
register_user(username,password)
return HttpResponse("registration success!")
return render(request, 'register.html') # Use your registration template path
def home(request):
print(2)
return render(request, 'home.html')
def data_analysis(request, button_id):
return render(request, 'data_analysis.html', {'button_id': button_id})
def data_analysis(request, button_id):
# according to button ID perform different treatments
if button_id == 1:
x,y = top_sunny_cities()
line_chart = (
Line()
.add_xaxis(xaxis_data=x)
.add_yaxis(series_name="Number of sunny days", y_axis=y)
.set_global_opts(title_opts=opts.TitleOpts(title="The number of sunny days in each city in a yeartop10"))
)
chart_html = line_chart.render_embed()
button_name = "line chart"
elif button_id == 2:
x,y = monthly_max_temperature_difference()
line_chart = (
Line()
.add_xaxis(xaxis_data=x)
.add_yaxis(series_name="Temperature difference", y_axis=y)
.set_global_opts(title_opts=opts.TitleOpts(title="Temperature difference changes in Beijing every month of the year"))
)
chart_html = line_chart.render_embed()
button_name = "line chart"
elif button_id == 3:
x,y = top_cities_high_wind()
bar_chart = (
Bar()
.add_xaxis(xaxis_data=x)
.add_yaxis(series_name="3Level wind speed times",y_axis=y)
.set_global_opts(title_opts=opts.TitleOpts(title="Appear3wind speed abovetop10cities"))
)
chart_html = bar_chart.render_embed()
button_name = "Bar chart"
elif button_id == 4:
x, y = top_rainy_cities()
bar_chart = (
Bar()
.add_xaxis(xaxis_data=x)
.add_yaxis(series_name="Number of rainy days", y_axis=y)
.set_global_opts(title_opts=opts.TitleOpts(title="rainy citytop10"))
)
chart_html = bar_chart.render_embed()
button_name = "Bar chart"
elif button_id == 5:
x, y = monthly_rainy_days()
pie = Pie()
pie.add("", list(zip(x, y)))
pie.set_global_opts(title_opts={"text": "Changes in rainy days in Hangzhou every month", "subtext": "2022Year"},
legend_opts=opts.LegendOpts(orient="vertical", pos_right="right", pos_top="center"))
chart_html = pie.render_embed()
button_name = "pie chart"
elif button_id == 6:
x, y = yearly_min_temperatures()
line_chart = (
Line()
.add_xaxis(xaxis_data=x)
.add_yaxis(series_name="temperature", y_axis=y)
.set_global_opts(title_opts=opts.TitleOpts(title="The city’s lowest temperature of the yeartop10"))
)
chart_html = line_chart.render_embed()
button_name = "line chart"
elif button_id == 7:
x,y=daily_temperature_difference()
# Create a pie chart
pie = (
Pie(init_opts=opts.InitOpts(width="800px", height="600px"))
.add(
series_name="Nanjing10month1~10No. temperature difference change",
data_pair=list(zip(x, y)),
radius=["40%", "75%"], # Set inner and outer radius,Achieve hollow effect
label_opts=opts.LabelOpts(is_show=True, position="inside"),
)
.set_global_opts(title_opts=opts.TitleOpts(title="Nanjing10month1~10No. temperature difference change"),
legend_opts=opts.LegendOpts(orient="vertical", pos_right="right", pos_top="center"),
)
.set_series_opts( # Set series options,Adjustment is_show threshold
label_opts=opts.LabelOpts(is_show=True)
)
)
chart_html = pie.render_embed()
button_name = "pie chart"
elif button_id == 8:
x,y=daily_wind_speed()
bar_chart = (
Bar()
.add_xaxis(xaxis_data=x)
.add_yaxis(series_name="Wind speed level", y_axis=y)
.set_global_opts(title_opts=opts.TitleOpts(title="Nanjing10Monthly daily wind speed changes"))
)
chart_html = bar_chart.render_embed()
button_name = "Bar chart"
return render(request, 'data_analysis.html', {'chart_html': chart_html, 'button_name': button_name})