Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

wc: pass GNU test wc-proc and Windows optimization #5612

Merged
merged 11 commits into from
Dec 7, 2023
89 changes: 83 additions & 6 deletions src/uu/wc/src/count_fast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.

// cSpell:ignore sysconf
use crate::word_count::WordCount;

use super::WordCountable;
Expand All @@ -11,11 +13,19 @@ use std::fs::OpenOptions;
use std::io::{self, ErrorKind, Read};

#[cfg(unix)]
use libc::S_IFREG;
use libc::{sysconf, S_IFREG, _SC_PAGESIZE};
#[cfg(unix)]
use nix::sys::stat;
#[cfg(unix)]
use std::io::{Seek, SeekFrom};
#[cfg(any(target_os = "linux", target_os = "android"))]
use std::os::unix::io::AsRawFd;
#[cfg(windows)]
use std::os::windows::fs::MetadataExt;
#[cfg(windows)]
const FILE_ATTRIBUTE_ARCHIVE: u32 = 32;
#[cfg(windows)]
const FILE_ATTRIBUTE_NORMAL: u32 = 128;

#[cfg(any(target_os = "linux", target_os = "android"))]
use libc::S_IFIFO;
Expand Down Expand Up @@ -72,6 +82,8 @@ fn count_bytes_using_splice(fd: &impl AsRawFd) -> Result<usize, usize> {
/// 1. On Unix, we can simply `stat` the file if it is regular.
/// 2. On Linux -- if the above did not work -- we can use splice to count
/// the number of bytes if the file is a FIFO.
/// 3. On Windows we can use `std::os::windows::fs::MetadataExt` to get file size
/// for regular files
/// 4. Otherwise, we just read normally, but without the overhead of counting
/// other things such as lines and words.
#[inline]
Expand All @@ -87,11 +99,60 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> (usize, Opti
// If stat.st_size = 0 then
// - either the size is 0
// - or the size is unknown.
// The second case happens for files in pseudo-filesystems. For
// example with /proc/version and /sys/kernel/profiling. So,
// if it is 0 we don't report that and instead do a full read.
if (stat.st_mode as libc::mode_t & S_IFREG) != 0 && stat.st_size > 0 {
return (stat.st_size as usize, None);
// The second case happens for files in pseudo-filesystems.
// For example with /proc/version.
// So, if it is 0 we don't report that and instead do a full read.
//
// Another thing to consider for files in pseudo-filesystems like /proc, /sys
// and similar is that they could report `st_size` greater than actual content.
// For example /sys/kernel/profiling could report `st_size` equal to
// system page size (typically 4096 on 64-bit systems), while its file content
// would add up to only a couple of bytes.
// This condition usually occurs for files in pseudo-filesystems like /proc, /sys
// that report `st_size` in the multiples of system page size.
// In such cases - attempt `seek()` almost to the end of the file
// and then fall back on read to count the rest.
//
// And finally a special case of input redirection in *nix shell:
// `( wc -c ; wc -c ) < file` should return
// ```
// size_of_file
// 0
// ```
// Similarly
// `( head -c1 ; wc -c ) < file` should return
// ```
// first_byte_of_file
// size_of_file - 1
// ```
// Since the input stream from file is treated as continuous across both commands inside ().
// In cases like this, due to `<` redirect, the `stat.st_mode` would report input as a regular file
// and `stat.st_size` would report the size of file on disk
// and NOT the remaining number of bytes in the input stream.
// However, the raw file descriptor in this situation would be equal to `0`
// for STDIN in both invocations.
// Therefore we cannot rely on `st_size` here and should fall back on full read.
if fd > 0 && (stat.st_mode as libc::mode_t & S_IFREG) != 0 && stat.st_size > 0 {
let sys_page_size = unsafe { sysconf(_SC_PAGESIZE) as usize };
if stat.st_size as usize % sys_page_size > 0 {
// regular file or file from /proc, /sys and similar pseudo-filesystems
// with size that is NOT a multiple of system page size
return (stat.st_size as usize, None);
} else if let Some(file) = handle.inner_file() {
// On some platforms `stat.st_blksize` and `stat.st_size`
// are of different types (i64 vs i32),
// e.g. macOS on Apple Silicon (aarch64-apple-darwin),
// Debian Linux on ARM (aarch64-unknown-linux-gnu),
// 32-bit i686 targets, etc.,
// while on others they are of the same type.
#[allow(clippy::unnecessary_cast)]
let offset =
stat.st_size as i64 - stat.st_size as i64 % (stat.st_blksize as i64 + 1);

if let Ok(n) = file.seek(SeekFrom::Start(offset as u64)) {
byte_count = n as usize;
}
}
}
#[cfg(any(target_os = "linux", target_os = "android"))]
{
Expand All @@ -107,6 +168,22 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> (usize, Opti
}
}

#[cfg(windows)]
{
if let Some(file) = handle.inner_file() {
if let Ok(metadata) = file.metadata() {
let attributes = metadata.file_attributes();
let size = metadata.file_size();

if (attributes & FILE_ATTRIBUTE_ARCHIVE) != 0
|| (attributes & FILE_ATTRIBUTE_NORMAL) != 0
{
return (size as usize, None);
zhitkoff marked this conversation as resolved.
Show resolved Hide resolved
}
}
}
}

// Fall back on `read`, but without the overhead of counting words and lines.
let mut buf = [0_u8; BUF_SIZE];
loop {
Expand Down
9 changes: 9 additions & 0 deletions src/uu/wc/src/countable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@
pub trait WordCountable: AsRawFd + Read {
/// Buffered reader type returned by [`WordCountable::buffered`].
type Buffered: BufRead;
/// Consumes the input and returns it as a buffered reader.
fn buffered(self) -> Self::Buffered;
/// Returns the underlying [`File`] when the input is backed by one, or `None`
/// otherwise, so callers can use file-only operations such as seeking.
fn inner_file(&mut self) -> Option<&mut File>;
}

#[cfg(not(unix))]
/// Input source for `wc` on non-Unix targets; unlike the other variant of this
/// trait it only requires `Read` (no raw file descriptor access).
pub trait WordCountable: Read {
/// Buffered reader type returned by [`WordCountable::buffered`].
type Buffered: BufRead;
/// Consumes the input and returns it as a buffered reader.
fn buffered(self) -> Self::Buffered;
/// Returns the underlying [`File`] when the input is backed by one, or `None`
/// otherwise, so callers can use file-only operations such as querying metadata.
fn inner_file(&mut self) -> Option<&mut File>;
}

impl WordCountable for StdinLock<'_> {
Expand All @@ -31,6 +33,9 @@
/// Returns the stdin lock itself; it is handed back unchanged as the buffered reader.
fn buffered(self) -> Self::Buffered {
self
}
/// Always `None`: a stdin lock does not expose a [`File`] handle.
fn inner_file(&mut self) -> Option<&mut File> {
None
}

Check warning on line 38 in src/uu/wc/src/countable.rs

View check run for this annotation

Codecov / codecov/patch

src/uu/wc/src/countable.rs#L36-L38

Added lines #L36 - L38 were not covered by tests
}

impl WordCountable for File {
Expand All @@ -39,4 +44,8 @@
/// Wraps the file in a [`BufReader`] so it can be read through `BufRead`.
fn buffered(self) -> Self::Buffered {
BufReader::new(self)
}

/// A `File` is its own inner file, so this always returns `Some(self)`.
fn inner_file(&mut self) -> Option<&mut File> {
Some(self)
}
}
16 changes: 16 additions & 0 deletions tests/by-util/test_wc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,14 @@ fn test_single_only_lines() {
.stdout_is("18 moby_dick.txt\n");
}

#[test]
fn test_single_only_bytes() {
    // `wc -c FILE` must print just the byte count followed by the file name.
    let expected = "772 lorem_ipsum.txt\n";
    new_ucmd!()
        .arg("-c")
        .arg("lorem_ipsum.txt")
        .run()
        .stdout_is(expected);
}

#[test]
fn test_single_all_counts() {
new_ucmd!()
Expand Down Expand Up @@ -419,6 +427,14 @@ fn test_files_from_pseudo_filesystem() {
use pretty_assertions::assert_ne;
let result = new_ucmd!().arg("-c").arg("/proc/cpuinfo").succeeds();
assert_ne!(result.stdout_str(), "0 /proc/cpuinfo\n");

let (at, mut ucmd) = at_and_ucmd!();
let result = ucmd.arg("-c").arg("/sys/kernel/profiling").succeeds();
let actual = at.read("/sys/kernel/profiling").len();
assert_eq!(
result.stdout_str(),
format!("{} /sys/kernel/profiling\n", actual)
);
}

#[test]
Expand Down
Loading