Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
function MSE = task1(companyDataInputFileName, sectorDataInputFileName)
    % TASK1  K-fold cross-validated mean squared error of a linear
    % regression that predicts a company's stock value.
    %
    %   MSE = task1(companyFile, sectorFile) loads both CSV files, joins
    %   columns 2:3 of the sector data onto the company data, builds the
    %   feature matrix with chooseFeatures, splits it into FOLD_COUNT
    %   folds, fits one model per held-out fold, and returns the summed
    %   squared validation error divided by the number of samples.
    %
    %   Inputs:
    %     companyDataInputFileName - path to the company CSV file
    %     sectorDataInputFileName  - path to the sector CSV file
    %   Output:
    %     MSE - mean squared validation error over all samples
    %
    %   NOTE: every constant below is read by the nested functions through
    %   the shared parent workspace, so none of them may be removed or
    %   renamed here, even if this body does not use it directly.
    %   NOTE(review): the two files are joined row by row; nothing checks
    %   that their date columns line up - confirm the inputs guarantee it.

    % Date format used in the input files.
    DATE_FORMAT = 'dd/mm/yyyy';
    % Row index of the first 2009 entry (per the original author's note);
    % rows before it only serve as history for later rows.
    FIRST_DATA = 254;
    % Number of cross-validation folds. Kept as an integer class for
    % compatibility with createFolds.
    FOLD_COUNT = uint8(5);

    % Column references into the matrix returned by lastDays.
    DATE = 1;
    COMPANY_STOCK_VOLUME = 2;
    COMPANY_STOCK_VALUE = 3;
    SECTOR_STOCK_VOLUME = 4;
    SECTOR_STOCK_VALUE = 5;
    DELTA_COMPANY_VOLUME = 6;
    DELTA_COMPANY_VALUE = 7;
    DELTA_SECTOR_VOLUME = 8;
    DELTA_SECTOR_VALUE = 9;

    % Load data from files.
    companyData = loadCSVData(companyDataInputFileName);
    sectorData = loadCSVData(sectorDataInputFileName);

    % Combine company and sector data (the sector date column is dropped).
    combinedData = [companyData(:,:) sectorData(:,2:3)];

    % One row per sample: [inputs, target].
    featureData = chooseFeatures(combinedData);

    % Create folds.
    folds = createFolds(featureData, FOLD_COUNT);

    % Summed squared validation error of each fold.
    squaredErrors = zeros(FOLD_COUNT, 1);
    for i = 1:FOLD_COUNT
        [trainingFolds, validationFold] = distributeFolds(folds, i);
        theta = calculateTheta(trainingFolds);
        squaredErrors(i) = calculateSquaredError(theta, validationFold);
    end

    % Divide by the number of samples (rows). length() would return the
    % column count instead whenever there are fewer rows than columns.
    MSE = sum(squaredErrors) / size(featureData, 1);
% Load a three-column, comma-separated file as text, without its header.
%   filename - path of the CSV file to read
%   data     - cell array of strings with one row per line after the
%              first and three columns
function data = loadCSVData(filename)
    [f, openMessage] = fopen(filename, 'r');
    if f == -1
        % Fail here with the file name instead of letting textscan
        % complain about an invalid file identifier.
        error('task1:loadCSVData:openFailed', ...
            'Could not open "%s": %s', filename, openMessage);
    end
    % Close the file however this function exits, including on an error
    % raised by textscan.
    closeFile = onCleanup(@() fclose(f));

    % Read every field as a string; CollectOutput gathers the three
    % columns into a single cell array.
    parsed = textscan(f, repmat('%s',1,3), 'delimiter',',', 'CollectOutput',true);

    % Remove header.
    data = parsed{1}(2:end,:);
end
% Build the regression data set from the combined company/sector table.
%   data        - cell array of strings; column 1 holds dates in
%                 DATE_FORMAT, the remaining columns hold numeric text
%   featureData - one row per data row from FIRST_DATA onwards, laid out
%                 as [1, features..., target], where the target is that
%                 day's COMPANY_STOCK_VALUE
% Raises task1:chooseFeatures:noData when data has fewer than FIRST_DATA
% rows (the original failed on an undefined variable in that case).
function featureData = chooseFeatures(data)
    totalRows = size(data, 1);
    if totalRows < FIRST_DATA
        error('task1:chooseFeatures:noData', ...
            'Need at least %d data rows, but only %d were loaded.', ...
            FIRST_DATA, totalRows);
    end
    usableRows = totalRows - FIRST_DATA + 1;

    % Allocated on the first iteration, once the feature count is known.
    featureData = [];

    for n = FIRST_DATA:totalRows
        dayNumber = datenum(data(n, 1), DATE_FORMAT);
        % Rows are the 100 preceding entries plus the current one (last
        % row); columns follow the *_VOLUME / *_VALUE / DELTA_* constants.
        history = lastDays(dayNumber, data, 100);

        % Feature sets tried previously; swap one into the literal below
        % to re-enable it (the leading constant 1 is always kept):
        %   Set 1: history(end-5:end-1, COMPANY_STOCK_VALUE);
        %          history(end-3:end-1, SECTOR_STOCK_VALUE);
        %          weekday(dayNumber);
        %   Set 2: history(end-3:end-1, COMPANY_STOCK_VALUE);
        %          history(end-5:end-1, SECTOR_STOCK_VALUE);
        %          weekday(dayNumber);
        %   Set 3: history(end-10:end-1, COMPANY_STOCK_VALUE);
        %   Set 4: history(end-2:end-1, COMPANY_STOCK_VALUE);
        %
        % Active: FEATURE SET 5.
        inputs = [
            % Constant (intercept term).
            1;
            % Previous 5 days of the company's stock value.
            history(end-5:end-1, COMPANY_STOCK_VALUE);
            % Previous 3 day-to-day changes of the company's stock value.
            % NOTE(review): the original comment said "sector average
            % stock price" but the code reads DELTA_COMPANY_VALUE; confirm
            % which one was intended.
            history(end-3:end-1, DELTA_COMPANY_VALUE);
        ]';

        % Output is the company's stock value on the current day.
        output = history(end, COMPANY_STOCK_VALUE);

        if isempty(featureData)
            % Size the result exactly instead of allocating a square
            % totalRows-by-totalRows matrix and cropping it afterwards.
            featureData = zeros(usableRows, numel(inputs) + 1);
        end
        featureData(n - FIRST_DATA + 1, :) = [inputs output];
    end
end
% Shuffle the rows of data and split them into k folds.
%   data  - matrix with one sample per row
%   k     - number of folds; any numeric class holding a positive integer
%   folds - k-by-1 cell array of row blocks. The first k-1 folds each hold
%           floor(rows/k) rows; the last fold also takes the remainder.
function folds = createFolds(data, k)
    % Do all index arithmetic in double. With an integer-class k (the
    % caller passes uint8), idivide returns that class and the products
    % 2*foldSize, 3*foldSize, ... saturate at 255, which corrupts every
    % fold after the first for data sets with more than 255 rows.
    foldCount = double(k);
    if ~isscalar(foldCount) || foldCount < 1 || foldCount ~= fix(foldCount)
        error('task1:createFolds:badFoldCount', ...
            'k must be a positive integer scalar.');
    end

    rowCount = size(data, 1);
    randomData = data(randperm(rowCount), :);
    foldSize = floor(rowCount / foldCount);

    folds = cell(foldCount, 1);
    for f = 1:foldCount
        firstRow = (f - 1) * foldSize + 1;
        if f < foldCount
            lastRow = f * foldSize;
        else
            % The last fold absorbs the rows left over by the division.
            lastRow = rowCount;
        end
        folds{f} = randomData(firstRow:lastRow, :);
    end
end
% Separate one fold for validation and stack the others for training.
%   folds      - cell array of matrices with matching column counts
%   i          - index of the fold to hold out
%   training   - all other folds concatenated row-wise, in their
%                original order
%   validation - contents of fold i
function [training, validation] = distributeFolds(folds, i)
    % The held-out fold is returned as is.
    validation = folds{i};

    % Drop the held-out cell from a copy, then stack what remains.
    remaining = folds;
    remaining(i) = [];
    training = vertcat(remaining{:});
end
% Fit the linear-regression coefficients by least squares.
%   trainingData - matrix whose last column is the target and whose other
%                  columns are the inputs
%   theta        - column vector of coefficients, one per input column
function theta = calculateTheta(trainingData)
    % Input matrix.
    designMatrix = trainingData(:, 1:end-1);
    % Output vector.
    targets = trainingData(:, end);

    % pinv(X)*Y is the minimum-norm least-squares solution, the same
    % quantity as pinv(X'*X)*X'*Y, but it avoids forming X'*X, which
    % squares the condition number of the problem.
    theta = pinv(designMatrix) * targets;
end
% Sum of squared prediction errors of a linear model on one fold.
%   theta          - column vector of regression coefficients
%   validationData - matrix whose last column is the target and whose
%                    other columns are the inputs
%   squaredError   - sum over rows of (target - inputs*theta)^2
function squaredError = calculateSquaredError(theta, validationData)
    inputCount = size(validationData, 2) - 1;

    % Residual per row: target minus the model's prediction.
    residuals = validationData(:, end) - validationData(:, 1:inputCount) * theta;

    squaredError = sum(residuals .^ 2);
end
% Return the data row for the given date together with the `days` rows
% before it. If no row carries exactly that date, the most recent earlier
% row is used instead.
%   dateNum - serial date number of the requested day
%   data    - cell array of strings; column 1 holds dates in DATE_FORMAT,
%             the remaining columns hold numeric text
%   days    - number of preceding rows to include
%   subset  - (days+1)-row matrix [zeros, values, deltas]: column 1 is a
%             zero placeholder for the date, then the numeric columns of
%             data, then each row's difference from the row before it.
%             The last row belongs to the requested day.
% Assumes the rows of data are in ascending date order - TODO confirm.
% Raises task1:lastDays:noData when no row is dated on or before dateNum,
% and task1:lastDays:shortHistory when fewer than days+1 rows precede the
% selected row (one extra row is needed for the first delta).
function subset = lastDays(dateNum, data, days)
    dateColumn = data(:, 1);

    % Fast path: exact text match on the formatted date.
    to = find(strcmp(dateColumn, datestr(dateNum, DATE_FORMAT)), 1);
    if isempty(to)
        % Documented fallback: latest row dated before the requested day.
        to = find(datenum(dateColumn, DATE_FORMAT) <= dateNum, 1, 'last');
    end
    if isempty(to)
        error('task1:lastDays:noData', ...
            'No data on or before %s.', datestr(dateNum, DATE_FORMAT));
    end

    from = to - days;
    if from < 2
        error('task1:lastDays:shortHistory', ...
            'Row %d has fewer than %d preceding rows.', to, days + 1);
    end

    % Convert once, including the extra leading row needed for the first
    % difference, instead of parsing the same strings twice.
    raw = str2double(data(from-1:to, 2:end));

    % Original values from company and sector stock data (date removed).
    values = raw(2:end, :);
    % Row-to-row changes in those values.
    deltas = diff(raw, 1, 1);

    % Column 1 stands in for the date, which is not used after this point.
    subset = [zeros(days + 1, 1) values deltas];
end
- end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement