read unstructured data with readtable function

6 次查看(过去 30 天)
I want to read the data in the attached file, using readtbale given below:
fname1='sample.txt';
opt = detectImportOptions(fname1, 'headerlines', 0);
data = readtable(fname1, opt);
However, the result is not correct because there is a shift of data structure. I plan to get following structure:
Is there any best way to do it? thank you for help

采纳的回答

Kevin Holly
Kevin Holly 2021-9-21
编辑:Kevin Holly 2021-9-21
You can simply use the "Import Data" button on the toolstrip in MATLAB. A user interface will show up and before importing the data as a table, you can click a dropdown box by the "Import Selection" button and select "Generate Script" or "Generate Function" to generate code.
Below is code generated by MATLAB:
%% Read columns of data as text:
% For more information, see the TEXTSCAN documentation.
formatSpec = '%8s%3s%5s%8s%5s%5s%5s%5s%s%[^\n\r]';
%% Open the text file.
fileID = fopen(filename,'r');
%% Read columns of data according to the format.
% This call is based on the structure of the file used to generate this code. If an error occurs for a different file, try regenerating the code from the Import Tool.
dataArray = textscan(fileID, formatSpec, 'Delimiter', '', 'WhiteSpace', '', 'TextType', 'string', 'ReturnOnError', false);
%% Close the text file.
fclose(fileID);
%% Convert the contents of columns containing numeric text to numbers.
% Replace non-numeric text with NaN.
raw = repmat({''},length(dataArray{1}),length(dataArray)-1);
for col=1:length(dataArray)-1
raw(1:length(dataArray{col}),col) = mat2cell(dataArray{col}, ones(length(dataArray{col}), 1));
end
numericData = NaN(size(dataArray{1},1),size(dataArray,2));
for col=[3,4,5,6,7,8,9]
% Converts text in the input cell array to numbers. Replaced non-numeric text with NaN.
rawData = dataArray{col};
for row=1:size(rawData, 1)
% Create a regular expression to detect and remove non-numeric prefixes and suffixes.
regexstr = '(?<prefix>.*?)(?<numbers>([-]*(\d+[\,]*)+[\.]{0,1}\d*[eEdD]{0,1}[-+]*\d*[i]{0,1})|([-]*(\d+[\,]*)*[\.]{1,1}\d+[eEdD]{0,1}[-+]*\d*[i]{0,1}))(?<suffix>.*)';
try
result = regexp(rawData(row), regexstr, 'names');
numbers = result.numbers;
% Detected commas in non-thousand locations.
invalidThousandsSeparator = false;
if numbers.contains(',')
thousandsRegExp = '^[-/+]*\d+?(\,\d{3})*\.{0,1}\d*$';
if isempty(regexp(numbers, thousandsRegExp, 'once'))
numbers = NaN;
invalidThousandsSeparator = true;
end
end
% Convert numeric text to numbers.
if ~invalidThousandsSeparator
numbers = textscan(char(strrep(numbers, ',', '')), '%f');
numericData(row, col) = numbers{1};
raw{row, col} = numbers{1};
end
catch
raw{row, col} = rawData{row};
end
end
end
% Convert the contents of columns with dates to MATLAB datetimes using the specified date format.
try
dates{1} = datetime(dataArray{1}, 'Format', 'HH:mm:ss', 'InputFormat', 'HH:mm:ss');
catch
try
% Handle dates surrounded by quotes
dataArray{1} = cellfun(@(x) x(2:end-1), dataArray{1}, 'UniformOutput', false);
dates{1} = datetime(dataArray{1}, 'Format', 'HH:mm:ss', 'InputFormat', 'HH:mm:ss');
catch
dates{1} = repmat(datetime([NaN NaN NaN]), size(dataArray{1}));
end
end
dates = dates(:,1);
%% Split data into numeric and string columns.
rawNumericColumns = raw(:, [3,4,5,6,7,8,9]);
rawStringColumns = string(raw(:, 2));
%% Make sure any text containing <undefined> is properly converted to an <undefined> categorical
idx = (rawStringColumns(:, 1) == "<undefined>");
rawStringColumns(idx, 1) = "";
%% Create output variable
test = table;
test.VarName1 = dates{:, 1};
test.VarName2 = categorical(rawStringColumns(:, 1));
test.VarName3 = cell2mat(rawNumericColumns(:, 1));
test.VarName4 = cell2mat(rawNumericColumns(:, 2));
test.VarName5 = cell2mat(rawNumericColumns(:, 3));
test.VarName6 = cell2mat(rawNumericColumns(:, 4));
test.VarName7 = cell2mat(rawNumericColumns(:, 5));
test.VarName8 = cell2mat(rawNumericColumns(:, 6));
test.VarName9 = cell2mat(rawNumericColumns(:, 7));
% For code requiring serial dates (datenum) instead of datetime, uncomment the following line(s) below to return the imported dates as datenum(s).
% test.VarName1=datenum(test.VarName1);
%% Clear temporary variables
clearvars filename formatSpec fileID dataArray ans raw col numericData rawData row regexstr result numbers invalidThousandsSeparator thousandsRegExp dates blankDates anyBlankDates invalidDates anyInvalidDates rawNumericColumns rawStringColumns idx;
This is what "Generate Function" creates:
function test1 = importfile(filename, startRow, endRow)
%IMPORTFILE Import numeric data from a text file as a matrix.
% TEST1 = IMPORTFILE(FILENAME)
% Reads data from text file FILENAME for the default selection.
%
% TEST1 = IMPORTFILE(FILENAME, STARTROW, ENDROW)
% Reads data from rows STARTROW through ENDROW of text file FILENAME.
%
% Example:
% test1 = importfile('test.txt', 1, 16);
%
% See also TEXTSCAN.
% Auto-generated by MATLAB on 2021/09/21 10:57:33
%% Initialize variables.
if nargin<=2
startRow = 1;
endRow = inf;
end
%% Read columns of data as text:
% For more information, see the TEXTSCAN documentation.
formatSpec = '%8s%3s%5s%8s%5s%5s%5s%5s%s%[^\n\r]';
%% Open the text file.
fileID = fopen(filename,'r');
%% Read columns of data according to the format.
% This call is based on the structure of the file used to generate this code. If an error occurs for a different file, try regenerating the code from the Import Tool.
dataArray = textscan(fileID, formatSpec, endRow(1)-startRow(1)+1, 'Delimiter', '', 'WhiteSpace', '', 'TextType', 'string', 'HeaderLines', startRow(1)-1, 'ReturnOnError', false, 'EndOfLine', '\r\n');
for block=2:length(startRow)
frewind(fileID);
dataArrayBlock = textscan(fileID, formatSpec, endRow(block)-startRow(block)+1, 'Delimiter', '', 'WhiteSpace', '', 'TextType', 'string', 'HeaderLines', startRow(block)-1, 'ReturnOnError', false, 'EndOfLine', '\r\n');
for col=1:length(dataArray)
dataArray{col} = [dataArray{col};dataArrayBlock{col}];
end
end
%% Close the text file.
fclose(fileID);
%% Convert the contents of columns containing numeric text to numbers.
% Replace non-numeric text with NaN.
raw = repmat({''},length(dataArray{1}),length(dataArray)-1);
for col=1:length(dataArray)-1
raw(1:length(dataArray{col}),col) = mat2cell(dataArray{col}, ones(length(dataArray{col}), 1));
end
numericData = NaN(size(dataArray{1},1),size(dataArray,2));
for col=[3,4,5,6,7,8,9]
% Converts text in the input cell array to numbers. Replaced non-numeric text with NaN.
rawData = dataArray{col};
for row=1:size(rawData, 1)
% Create a regular expression to detect and remove non-numeric prefixes and suffixes.
regexstr = '(?<prefix>.*?)(?<numbers>([-]*(\d+[\,]*)+[\.]{0,1}\d*[eEdD]{0,1}[-+]*\d*[i]{0,1})|([-]*(\d+[\,]*)*[\.]{1,1}\d+[eEdD]{0,1}[-+]*\d*[i]{0,1}))(?<suffix>.*)';
try
result = regexp(rawData(row), regexstr, 'names');
numbers = result.numbers;
% Detected commas in non-thousand locations.
invalidThousandsSeparator = false;
if numbers.contains(',')
thousandsRegExp = '^[-/+]*\d+?(\,\d{3})*\.{0,1}\d*$';
if isempty(regexp(numbers, thousandsRegExp, 'once'))
numbers = NaN;
invalidThousandsSeparator = true;
end
end
% Convert numeric text to numbers.
if ~invalidThousandsSeparator
numbers = textscan(char(strrep(numbers, ',', '')), '%f');
numericData(row, col) = numbers{1};
raw{row, col} = numbers{1};
end
catch
raw{row, col} = rawData{row};
end
end
end
% Convert the contents of columns with dates to MATLAB datetimes using the specified date format.
try
dates{1} = datetime(dataArray{1}, 'Format', 'HH:mm:ss', 'InputFormat', 'HH:mm:ss');
catch
try
% Handle dates surrounded by quotes
dataArray{1} = cellfun(@(x) x(2:end-1), dataArray{1}, 'UniformOutput', false);
dates{1} = datetime(dataArray{1}, 'Format', 'HH:mm:ss', 'InputFormat', 'HH:mm:ss');
catch
dates{1} = repmat(datetime([NaN NaN NaN]), size(dataArray{1}));
end
end
dates = dates(:,1);
%% Split data into numeric and string columns.
rawNumericColumns = raw(:, [3,4,5,6,7,8,9]);
rawStringColumns = string(raw(:, 2));
%% Make sure any text containing <undefined> is properly converted to an <undefined> categorical
idx = (rawStringColumns(:, 1) == "<undefined>");
rawStringColumns(idx, 1) = "";
%% Create output variable
test1 = table;
test1.VarName1 = dates{:, 1};
test1.VarName2 = categorical(rawStringColumns(:, 1));
test1.VarName3 = cell2mat(rawNumericColumns(:, 1));
test1.VarName4 = cell2mat(rawNumericColumns(:, 2));
test1.VarName5 = cell2mat(rawNumericColumns(:, 3));
test1.VarName6 = cell2mat(rawNumericColumns(:, 4));
test1.VarName7 = cell2mat(rawNumericColumns(:, 5));
test1.VarName8 = cell2mat(rawNumericColumns(:, 6));
test1.VarName9 = cell2mat(rawNumericColumns(:, 7));
% For code requiring serial dates (datenum) instead of datetime, uncomment the following line(s) below to return the imported dates as datenum(s).
% test1.VarName1=datenum(test1.VarName1);
  3 个评论
Kevin Holly
Kevin Holly 2021-9-21
编辑:Kevin Holly 2021-9-21
Maruki,
I imported the data shown above with the previous data and generated the following functon:
function test3 = importfile(filename, startRow, endRow)
%IMPORTFILE Import numeric data from a text file as a matrix.
% TEST3 = IMPORTFILE(FILENAME)
% Reads data from text file FILENAME for the default selection.
%
% TEST3 = IMPORTFILE(FILENAME, STARTROW, ENDROW)
% Reads data from rows STARTROW through ENDROW of text file FILENAME.
%
% Example:
% test3 = importfile('test3.txt', 1, 27);
%
% See also TEXTSCAN.
% Auto-generated by MATLAB on 2021/09/21 11:44:28
%% Initialize variables.
if nargin<=2
startRow = 1;
endRow = inf;
end
%% Read columns of data as text:
% For more information, see the TEXTSCAN documentation.
formatSpec = '%8s%3s%5s%8s%5s%5s%5s%5s%s%[^\n\r]';
%% Open the text file.
fileID = fopen(filename,'r');
%% Read columns of data according to the format.
% This call is based on the structure of the file used to generate this code. If an error occurs for a different file, try regenerating the code from the Import Tool.
dataArray = textscan(fileID, formatSpec, endRow(1)-startRow(1)+1, 'Delimiter', '', 'WhiteSpace', '', 'TextType', 'string', 'HeaderLines', startRow(1)-1, 'ReturnOnError', false, 'EndOfLine', '\r\n');
for block=2:length(startRow)
frewind(fileID);
dataArrayBlock = textscan(fileID, formatSpec, endRow(block)-startRow(block)+1, 'Delimiter', '', 'WhiteSpace', '', 'TextType', 'string', 'HeaderLines', startRow(block)-1, 'ReturnOnError', false, 'EndOfLine', '\r\n');
for col=1:length(dataArray)
dataArray{col} = [dataArray{col};dataArrayBlock{col}];
end
end
%% Close the text file.
fclose(fileID);
%% Convert the contents of columns containing numeric text to numbers.
% Replace non-numeric text with NaN.
raw = repmat({''},length(dataArray{1}),length(dataArray)-1);
for col=1:length(dataArray)-1
raw(1:length(dataArray{col}),col) = mat2cell(dataArray{col}, ones(length(dataArray{col}), 1));
end
numericData = NaN(size(dataArray{1},1),size(dataArray,2));
for col=[3,4,5,6,7,8,9]
% Converts text in the input cell array to numbers. Replaced non-numeric text with NaN.
rawData = dataArray{col};
for row=1:size(rawData, 1)
% Create a regular expression to detect and remove non-numeric prefixes and suffixes.
regexstr = '(?<prefix>.*?)(?<numbers>([-]*(\d+[\,]*)+[\.]{0,1}\d*[eEdD]{0,1}[-+]*\d*[i]{0,1})|([-]*(\d+[\,]*)*[\.]{1,1}\d+[eEdD]{0,1}[-+]*\d*[i]{0,1}))(?<suffix>.*)';
try
result = regexp(rawData(row), regexstr, 'names');
numbers = result.numbers;
% Detected commas in non-thousand locations.
invalidThousandsSeparator = false;
if numbers.contains(',')
thousandsRegExp = '^[-/+]*\d+?(\,\d{3})*\.{0,1}\d*$';
if isempty(regexp(numbers, thousandsRegExp, 'once'))
numbers = NaN;
invalidThousandsSeparator = true;
end
end
% Convert numeric text to numbers.
if ~invalidThousandsSeparator
numbers = textscan(char(strrep(numbers, ',', '')), '%f');
numericData(row, col) = numbers{1};
raw{row, col} = numbers{1};
end
catch
raw{row, col} = rawData{row};
end
end
end
% Convert the contents of columns with dates to MATLAB datetimes using the specified date format.
try
dates{1} = datetime(dataArray{1}, 'Format', 'HH:mm:ss', 'InputFormat', 'HH:mm:ss');
catch
try
% Handle dates surrounded by quotes
dataArray{1} = cellfun(@(x) x(2:end-1), dataArray{1}, 'UniformOutput', false);
dates{1} = datetime(dataArray{1}, 'Format', 'HH:mm:ss', 'InputFormat', 'HH:mm:ss');
catch
dates{1} = repmat(datetime([NaN NaN NaN]), size(dataArray{1}));
end
end
dates = dates(:,1);
%% Split data into numeric and string columns.
rawNumericColumns = raw(:, [3,4,5,6,7,8,9]);
rawStringColumns = string(raw(:, 2));
%% Replace non-numeric cells with NaN
R = cellfun(@(x) ~isnumeric(x) && ~islogical(x),rawNumericColumns); % Find non-numeric cells
rawNumericColumns(R) = {NaN}; % Replace non-numeric cells
%% Make sure any text containing <undefined> is properly converted to an <undefined> categorical
idx = (rawStringColumns(:, 1) == "<undefined>");
rawStringColumns(idx, 1) = "";
%% Create output variable
test3 = table;
test3.VarName1 = dates{:, 1};
test3.VarName2 = categorical(rawStringColumns(:, 1));
test3.VarName3 = cell2mat(rawNumericColumns(:, 1));
test3.VarName4 = cell2mat(rawNumericColumns(:, 2));
test3.VarName5 = cell2mat(rawNumericColumns(:, 3));
test3.VarName6 = cell2mat(rawNumericColumns(:, 4));
test3.VarName7 = cell2mat(rawNumericColumns(:, 5));
test3.VarName8 = cell2mat(rawNumericColumns(:, 6));
test3.VarName9 = cell2mat(rawNumericColumns(:, 7));
% For code requiring serial dates (datenum) instead of datetime, uncomment the following line(s) below to return the imported dates as datenum(s).
% test3.VarName1=datenum(test3.VarName1);
This should place NaN where values are missing.

请先登录,再进行评论。

更多回答(1 个)

Walter Roberson
Walter Roberson 2021-9-21
The table is fixed width. Use fixed width import options; https://www.mathworks.com/help/matlab/ref/matlab.io.text.fixedwidthimportoptions.html

Community Treasure Hunt

Find the treasures in MATLAB Central and discover how the community can help you!

Start Hunting!

Translated by